From 3133d6ab1a108f41cc2d83ccd47e41b007d6c84f Mon Sep 17 00:00:00 2001 From: Guo Sheng Date: Thu, 10 Aug 2023 16:34:16 +0800 Subject: [PATCH] Remove contents in ppdiffusers (#6663) --- ppdiffusers/LICENSE | 203 - ppdiffusers/Makefile | 30 - ppdiffusers/VERSION | 1 - ppdiffusers/deploy/README.md | 186 - ppdiffusers/deploy/controlnet/README.md | 136 - ppdiffusers/deploy/controlnet/export.md | 55 - ppdiffusers/deploy/controlnet/export_model.py | 234 -- ppdiffusers/deploy/controlnet/infer.py | 625 ---- .../deploy/controlnet/infer_dygraph.py | 360 -- .../deploy/controlnet/infer_dygraph_toch.py | 419 --- ppdiffusers/deploy/export.md | 65 - ppdiffusers/deploy/export_model.py | 176 - ppdiffusers/deploy/infer.py | 708 ---- ppdiffusers/deploy/infer_dygraph.py | 358 -- ppdiffusers/deploy/infer_dygraph_torch.py | 432 --- ppdiffusers/deploy/requirements.txt | 1 - .../scripts/test_controlnet_infer_dygraph.sh | 58 - .../test_controlnet_infer_dygraph_torch.sh | 32 - .../scripts/test_controlnet_infer_fd.sh | 65 - .../deploy/scripts/test_infer_dygraph.sh | 75 - .../scripts/test_infer_dygraph_torch.sh | 29 - ppdiffusers/deploy/scripts/test_infer_fd.sh | 130 - .../export_model.py | 171 - .../stable_diffusion_image_variation/infer.py | 419 --- .../infer_dygraph.py | 141 - .../infer_dygraph_torch.py | 283 -- .../examples/Stable-CycleDiffusion/README.md | 26 - .../examples/Stable-CycleDiffusion/app.py | 676 ---- .../Stable-CycleDiffusion/ptp_utils.py | 134 - .../Stable-CycleDiffusion/requirements.txt | 3 - .../Stable-CycleDiffusion/seq_aligner.py | 206 - .../examples/autoencoder/vae/README.md | 236 -- .../vae/config/f8encoder_f16decoder.yaml | 74 - .../examples/autoencoder/vae/config/vae.json | 36 - .../vae/data/filelist/laion400m_en.filelist | 10 - .../vae/data/filelist/train.filelist.list | 1 - .../vae/data/filelist/write_filelist.py | 20 - .../examples/autoencoder/vae/ldm/__init__.py | 17 - .../vae/ldm/autoencoder_datasets.py | 603 --- .../vae/ldm/image_degradation/__init__.py | 16 - .../vae/ldm/image_degradation/bsrgan.py | 643 ---- .../vae/ldm/image_degradation/bsrgan_light.py | 648 ---- .../vae/ldm/image_degradation/utils_image.py | 209 -- .../examples/autoencoder/vae/ldm/losses.py | 542 --- .../examples/autoencoder/vae/ldm/model.py | 365 -- .../autoencoder/vae/ldm/text_image_pair.py | 235 -- .../examples/autoencoder/vae/requirements.txt | 10 - ppdiffusers/examples/autoencoder/vae/run.sh | 42 - .../autoencoder/vae/scripts/README.md | 46 - .../vae/scripts/calculate_psnr_ssim.py | 190 - .../scripts/convert_kl_8_to_ppdiffusers.py | 401 -- .../autoencoder/vae/scripts/fid_score.py | 278 -- .../vae/scripts/get_autoencoder_results.py | 70 - .../autoencoder/vae/scripts/inception.py | 493 --- .../examples/autoencoder/vae/train_vae.py | 487 --- .../examples/clip_interrogator/LICENSE | 21 - .../examples/clip_interrogator/README.md | 31 - .../clip_interrogator/__init__.py | 39 - .../clip_interrogator/blip_decoder.py | 126 - .../clip_interrogator/clip_interrogator.py | 355 -- .../examples/clip_interrogator/dumpy.py | 80 - .../examples/clip_interrogator/predict.py | 73 - .../clip_interrogator/requirements.txt | 5 - .../examples/clip_interrogator/run_cli.py | 137 - .../examples/clip_interrogator/run_gradio.py | 136 - ppdiffusers/examples/community/README.md | 628 ---- ...p_guided_images_mixing_stable_diffusion.py | 408 -- .../community/clip_guided_stable_diffusion.py | 425 --- .../community/composable_stable_diffusion.py | 424 --- .../inference_clip_guided_stable_diffusion.py | 147 - 
.../community/interpolate_stable_diffusion.py | 523 --- .../community/lpw_stable_diffusion.py | 1078 ------ .../examples/community/mixture_tiling.py | 417 --- .../examples/community/one_step_unet.py | 37 - ...e_fastdeploy_stable_diffusion_hires_fix.py | 589 --- ...tdeploy_stable_diffusion_mixture_tiling.py | 453 --- .../examples/community/reference_only.py | 1110 ------ .../stable_diffusion_controlnet_img2img.py | 912 ----- .../community/stable_diffusion_hires_fix.py | 768 ---- .../community/stable_diffusion_mega.py | 3333 ----------------- .../community/webui_stable_diffusion.py | 2151 ----------- .../community/wildcard_stable_diffusion.py | 423 --- ppdiffusers/examples/controlnet/README.md | 211 -- .../controlnet/annotator/_base_/ade20k.yml | 44 - .../annotator/_base_/cityscapes.yml | 44 - .../annotator/_base_/cityscapes_1024x1024.yml | 20 - .../controlnet/annotator/canny/__init__.py | 21 - .../controlnet/annotator/hed/__init__.py | 168 - .../annotator/midas_paddle/__init__.py | 49 - .../annotator/midas_paddle/api_inference.py | 89 - .../controlnet/annotator/mlsd/__init__.py | 48 - .../annotator/mlsd/models/mbv2_mlsd_large.py | 309 -- .../controlnet/annotator/mlsd/utils.py | 208 - .../controlnet/annotator/openpose/__init__.py | 51 - .../controlnet/annotator/openpose/util.py | 247 -- .../annotator/ppdet_hrnet/__init__.py | 101 - .../annotator/ppdet_hrnet/benchmark_utils.py | 273 -- .../ppdet_hrnet/det_keypoint_unite_infer.py | 448 --- .../ppdet_hrnet/det_keypoint_unite_utils.py | 122 - .../controlnet/annotator/ppdet_hrnet/infer.py | 1052 ------ .../annotator/ppdet_hrnet/keypoint_infer.py | 373 -- .../ppdet_hrnet/keypoint_postprocess.py | 347 -- .../ppdet_hrnet/keypoint_preprocess.py | 233 -- .../ppdet_hrnet/picodet_postprocess.py | 221 -- .../annotator/ppdet_hrnet/preprocess.py | 482 --- .../controlnet/annotator/ppdet_hrnet/util.py | 245 -- .../controlnet/annotator/ppdet_hrnet/utils.py | 473 --- .../annotator/ppdet_hrnet/visualize.py | 365 -- .../annotator/segformer_paddle/__init__.py | 400 -- .../annotator/segformer_paddle/predict.py | 261 -- .../segformer_b5_ade20k_512x512_160k.yml | 45 - ...segformer_b5_cityscapes_1024x1024_160k.yml | 34 - .../annotator/segmenter_paddle/__init__.py | 388 -- .../annotator/segmenter_paddle/predict.py | 261 -- ...er_vit_base_linear_ade20k_512x512_160k.yml | 38 - ...nter_vit_base_mask_ade20k_512x512_160k.yml | 10 - .../controlnet/annotator/shuffle/__init__.py | 88 - .../examples/controlnet/annotator/util.py | 82 - .../examples/controlnet/control/__init__.py | 19 - .../controlnet/control/control_args.py | 61 - .../controlnet/control/control_trainer.py | 112 - .../controlnet/control/dumpy_dataset.py | 70 - .../examples/controlnet/control/model.py | 289 -- .../extract_controlnet_ema_weights.py | 50 - .../examples/controlnet/gradio_canny2image.py | 131 - .../examples/controlnet/gradio_depth2image.py | 128 - .../examples/controlnet/gradio_hed2image.py | 133 - .../examples/controlnet/gradio_hough2image.py | 143 - .../examples/controlnet/gradio_ip2p2image.py | 122 - .../controlnet/gradio_normal2image.py | 136 - .../controlnet/gradio_pose2image_openpose.py | 136 - .../controlnet/gradio_pose2image_ppdetpose.py | 136 - .../controlnet/gradio_seg2image_segformer.py | 133 - .../controlnet/gradio_seg2image_segmenter.py | 133 - .../controlnet/gradio_shuffle2image.py | 122 - .../examples/controlnet/requirements.txt | 6 - .../train_txt2img_control_trainer.py | 99 - ppdiffusers/examples/dreambooth/README.md | 282 -- .../examples/dreambooth/requirements.txt | 3 - 
.../examples/dreambooth/train_dreambooth.py | 858 ----- .../dreambooth/train_dreambooth_lora.py | 1070 ------ ppdiffusers/examples/inference/README.md | 36 - ...e_guided_generation-versatile_diffusion.py | 27 - ...uided_image_inpainting-paint_by_example.py | 33 - .../inference/image_inpainting-repaint.py | 38 - .../image_mixing-stable_diffusion.py | 70 - ...ge_text_guided_generation-alt_diffusion.py | 31 - ...image_text_guided_generation-controlnet.py | 36 - ...age_text_guided_generation-deepfloyd_if.py | 59 - ...text_guided_generation-stable_diffusion.py | 33 - ...xt_guided_generation-stable_diffusion_2.py | 32 - .../image_to_text_generation-unidiffuser.py | 23 - .../image_variation-stable_diffusion.py | 43 - .../inference/image_variation-unidiffuser.py | 23 - .../image_variation-versatile_diffusion.py | 24 - .../super_resolution-latent_diffusion.py | 33 - ...ed_generation-semantic_stable_diffusion.py | 43 - ...xt_guided_image_inpainting-deepfloyd_if.py | 55 - ...uided_image_inpainting-stable_diffusion.py | 32 - ...ded_image_inpainting-stable_diffusion_2.py | 30 - ...ided_image_upscaling-stable_diffusion_2.py | 25 - .../text_to_audio_generation-audio_ldm.py | 31 - .../text_to_image_generation-alt_diffusion.py | 25 - .../text_to_image_generation-controlnet.py | 47 - .../text_to_image_generation-deepfloyd_if.py | 65 - ...xt_to_image_generation-latent_diffusion.py | 25 - ...xt_to_image_generation-stable_diffusion.py | 25 - ..._to_image_generation-stable_diffusion_2.py | 24 - ..._image_generation-stable_diffusion_safe.py | 22 - .../text_to_image_generation-t2i-adapter.py | 35 - .../text_to_image_generation-unclip.py | 24 - .../text_to_image_generation-unidiffuser.py | 22 - ...to_image_generation-versatile_diffusion.py | 21 - .../text_to_image_generation-vq_diffusion.py | 22 - ...eration_mixture_tiling-stable_diffusion.py | 43 - .../text_to_video_generation-synth.py | 24 - .../text_to_video_generation-zero.py | 28 - .../inference/text_variation-unidiffuser.py | 23 - ...tional_audio_generation-audio_diffusion.py | 34 - ...tional_audio_generation-dance_diffusion.py | 27 - ..._audio_generation-spectrogram_diffusion.py | 36 - .../unconditional_image_generation-ddim.py | 24 - .../unconditional_image_generation-ddpm.py | 24 - ...mage_generation-latent_diffusion_uncond.py | 24 - .../unconditional_image_generation-pndm.py | 24 - ...nditional_image_generation-score_sde_ve.py | 24 - ...l_image_generation-stochastic_karras_ve.py | 25 - ...onditional_image_generation-unidiffuser.py | 21 - ...image_text_joint_generation-unidiffuser.py | 24 - ...conditional_text_generation-unidiffuser.py | 22 - ppdiffusers/examples/reproduce/README.md | 44 - ppdiffusers/examples/reproduce/README_cn.md | 45 - .../examples/reproduce/align_record.md | 69 - ppdiffusers/examples/stable_diffusion/bf16.sh | 56 - .../data/filelist/laion400m_en.filelist | 10 - .../data/filelist/laion_aes.filelist | 50 - .../data/filelist/laion_aes.filelist.list | 1 - .../data/filelist/train.filelist.list | 1 - .../data/filelist/write_filelist.py | 20 - .../examples/stable_diffusion/prepare.sh | 21 - .../stable_diffusion/requirements.txt | 6 - .../examples/stable_diffusion/sd/__init__.py | 19 - .../examples/stable_diffusion/sd/model.py | 355 -- .../examples/stable_diffusion/sd/sd_args.py | 157 - .../stable_diffusion/sd/sd_trainer.py | 254 -- .../sd/text_image_pair_dataset.py | 236 -- .../train_txt2img_laion400m_trainer.py | 121 - ppdiffusers/examples/t2i-adapter/README.md | 167 - .../examples/t2i-adapter/adapter/__init__.py | 20 - 
.../t2i-adapter/adapter/adapter_args.py | 129 - .../t2i-adapter/adapter/adapter_trainer.py | 140 - .../t2i-adapter/adapter/annotator_utils.py | 131 - .../t2i-adapter/adapter/data_preprocess.py | 103 - .../t2i-adapter/adapter/dumpy_dataset.py | 82 - .../examples/t2i-adapter/adapter/model.py | 332 -- .../adapter/text_image_pair_dataset.py | 224 -- ppdiffusers/examples/t2i-adapter/annotator | 1 - .../t2i-adapter/config/openpose_adapter.json | 16 - .../data/laion-aes-canny.filelist.test | 1 - .../data/laion-aes-canny.filelist.train | 50 - .../data/laion-aes-openpose.filelist.test | 1 - .../data/laion-aes-openpose.filelist.train | 49 - .../t2i-adapter/data/test.canny.filelist | 1 - .../t2i-adapter/data/test.openpose.filelist | 1 - .../t2i-adapter/data/train.canny.filelist | 1 - .../t2i-adapter/data/train.openpose.filelist | 1 - ppdiffusers/examples/t2i-adapter/generate.py | 236 -- .../examples/t2i-adapter/requirements.txt | 5 - ...onvert_diffusers_adapter_to_ppdiffusers.py | 63 - ...onvert_orig_adapter_ckpt_to_ppdiffusers.py | 132 - .../convert_t2i_adapter_to_latest_version.py | 94 - .../t2i-adapter/tools/make_dummpy_dataset.py | 124 - .../t2i-adapter/train_t2i_adapter_trainer.py | 101 - ppdiffusers/examples/text_to_image/README.md | 250 -- .../examples/text_to_image/requirements.txt | 3 - .../text_to_image/train_text_to_image.py | 816 ---- .../text_to_image/train_text_to_image_lora.py | 921 ----- .../text_to_image_laion400m/README.md | 331 -- .../config/ldmbert.json | 15 - .../text_to_image_laion400m/config/unet.json | 35 - .../data/filelist/laion400m_en.filelist | 10 - .../data/filelist/laion_aes.filelist | 50 - .../data/filelist/laion_aes.filelist.list | 1 - .../data/filelist/train.filelist.list | 1 - .../data/filelist/write_filelist.py | 20 - .../generate_images.py | 165 - .../generate_pipelines.py | 167 - .../text_to_image_laion400m/ldm/__init__.py | 19 - .../text_to_image_laion400m/ldm/ldm_args.py | 180 - .../ldm/ldm_trainer.py | 220 -- .../text_to_image_laion400m/ldm/model.py | 340 -- .../ldm/text_image_pair_dataset.py | 236 -- .../text_to_image_laion400m/requirements.txt | 5 - .../text_to_image_laion400m/scripts/README.md | 60 - .../convert_orig_ldm_ckpt_to_ppdiffusers.py | 775 ---- .../convert_ppdiffusers_to_orig_ldm_ckpt.py | 331 -- .../scripts/plot_fid_clip_score.py | 44 - .../scripts/text2img_L12H768_unet800M.yaml | 104 - .../scripts/text2img_L32H1280_unet800M.yaml | 105 - .../train_txt2img_laion400m_no_trainer.py | 218 -- .../train_txt2img_laion400m_trainer.py | 107 - .../examples/textual_inversion/README.md | 285 -- .../textual_inversion/requirements.txt | 3 - .../train_textual_inversion.py | 930 ----- ppdiffusers/examples/tomesd/README.md | 120 - .../unconditional_image_generation/README.md | 123 - .../requirements.txt | 5 - .../train_unconditional.py | 616 --- ppdiffusers/ppdiffusers/__init__.py | 257 -- ppdiffusers/ppdiffusers/commands/__init__.py | 28 - ppdiffusers/ppdiffusers/commands/env.py | 67 - .../ppdiffusers/commands/ppdiffusers_cli.py | 42 - .../ppdiffusers/configuration_utils.py | 666 ---- .../ppdiffusers/experimental/README.md | 6 - .../ppdiffusers/experimental/__init__.py | 16 - .../ppdiffusers/experimental/rl/__init__.py | 16 - .../experimental/rl/value_guided_sampling.py | 152 - ppdiffusers/ppdiffusers/image_processor.py | 255 -- ppdiffusers/ppdiffusers/initializer.py | 20 - ppdiffusers/ppdiffusers/loaders.py | 1621 -------- ppdiffusers/ppdiffusers/models/__init__.py | 46 - ppdiffusers/ppdiffusers/models/adapter.py | 252 -- 
ppdiffusers/ppdiffusers/models/attention.py | 574 --- .../ppdiffusers/models/attention_processor.py | 1078 ------ .../ppdiffusers/models/autoencoder_kl.py | 341 -- ppdiffusers/ppdiffusers/models/controlnet.py | 618 --- .../ppdiffusers/models/cross_attention.py | 95 - .../ppdiffusers/models/dual_transformer_2d.py | 152 - ppdiffusers/ppdiffusers/models/ema.py | 104 - ppdiffusers/ppdiffusers/models/embeddings.py | 464 --- .../models/modeling_pytorch_paddle_utils.py | 167 - .../ppdiffusers/models/modeling_utils.py | 791 ---- .../ppdiffusers/models/prior_transformer.py | 219 -- ppdiffusers/ppdiffusers/models/resnet.py | 866 ----- .../ppdiffusers/models/t5_film_transformer.py | 323 -- .../ppdiffusers/models/transformer_2d.py | 309 -- .../models/transformer_temporal.py | 165 - ppdiffusers/ppdiffusers/models/unet_1d.py | 250 -- .../ppdiffusers/models/unet_1d_blocks.py | 724 ---- ppdiffusers/ppdiffusers/models/unet_2d.py | 351 -- .../ppdiffusers/models/unet_2d_blocks.py | 2916 -------------- .../ppdiffusers/models/unet_2d_condition.py | 836 ----- .../ppdiffusers/models/unet_3d_blocks.py | 638 ---- .../ppdiffusers/models/unet_3d_condition.py | 527 --- ppdiffusers/ppdiffusers/models/uvit.py | 386 -- ppdiffusers/ppdiffusers/models/vae.py | 424 --- ppdiffusers/ppdiffusers/models/vq_model.py | 157 - ppdiffusers/ppdiffusers/optimization.py | 312 -- ppdiffusers/ppdiffusers/patches/__init__.py | 15 - .../ppdiffusers/patches/ppnlp_patch_utils.py | 1544 -------- .../ppdiffusers/patches/tomesd_patch_utils.py | 429 --- .../patches/webui_lora_patch_utils.py | 2622 ------------- ppdiffusers/ppdiffusers/pipeline_utils.py | 24 - ppdiffusers/ppdiffusers/pipelines/README.md | 569 --- ppdiffusers/ppdiffusers/pipelines/__init__.py | 163 - .../pipelines/alt_diffusion/__init__.py | 48 - .../alt_diffusion/modeling_roberta_series.py | 155 - .../alt_diffusion/pipeline_alt_diffusion.py | 611 --- .../pipeline_alt_diffusion_img2img.py | 682 ---- .../pipelines/audio_diffusion/__init__.py | 18 - .../pipelines/audio_diffusion/mel.py | 160 - .../pipeline_audio_diffusion.py | 273 -- .../pipelines/audioldm/__init__.py | 28 - .../pipelines/audioldm/pipeline_audioldm.py | 505 --- .../pipelines/dance_diffusion/__init__.py | 17 - .../pipeline_dance_diffusion.py | 109 - .../ppdiffusers/pipelines/ddim/__init__.py | 17 - .../pipelines/ddim/pipeline_ddim.py | 117 - .../ppdiffusers/pipelines/ddpm/__init__.py | 17 - .../pipelines/ddpm/pipeline_ddpm.py | 92 - .../pipelines/deepfloyd_if/__init__.py | 75 - .../pipelines/deepfloyd_if/pipeline_if.py | 750 ---- .../deepfloyd_if/pipeline_if_img2img.py | 865 ----- .../pipeline_if_img2img_superresolution.py | 982 ----- .../deepfloyd_if/pipeline_if_inpainting.py | 989 ----- .../pipeline_if_inpainting_superresolution.py | 1098 ------ .../pipeline_if_superresolution.py | 843 ----- .../pipelines/deepfloyd_if/safety_checker.py | 75 - .../pipelines/deepfloyd_if/timesteps.py | 593 --- .../pipelines/deepfloyd_if/watermark.py | 60 - .../ppdiffusers/pipelines/dit/__init__.py | 16 - .../ppdiffusers/pipelines/dit/pipeline_dit.py | 208 - .../ppdiffusers/pipelines/fastdeploy_utils.py | 1477 -------- .../pipelines/latent_diffusion/__init__.py | 25 - .../pipeline_latent_diffusion.py | 802 ---- ...peline_latent_diffusion_superresolution.py | 155 - .../latent_diffusion_uncond/__init__.py | 17 - .../pipeline_latent_diffusion_uncond.py | 101 - .../pipelines/paint_by_example/__init__.py | 26 - .../paint_by_example/image_encoder.py | 81 - .../pipeline_paint_by_example.py | 530 --- 
.../ppdiffusers/pipelines/pipeline_utils.py | 1687 --------- .../ppdiffusers/pipelines/pndm/__init__.py | 17 - .../pipelines/pndm/pipeline_pndm.py | 98 - .../ppdiffusers/pipelines/repaint/__init__.py | 16 - .../pipelines/repaint/pipeline_repaint.py | 169 - .../pipelines/score_sde_ve/__init__.py | 17 - .../score_sde_ve/pipeline_score_sde_ve.py | 100 - .../semantic_stable_diffusion/__init__.py | 45 - .../custom_quantile.py | 207 - .../pipeline_semantic_stable_diffusion.py | 713 ---- .../spectrogram_diffusion/__init__.py | 40 - .../continous_encoder.py | 81 - .../spectrogram_diffusion/midi_utils.py | 637 ---- .../spectrogram_diffusion/notes_encoder.py | 76 - .../pipeline_spectrogram_diffusion.py | 177 - .../pipelines/stable_diffusion/__init__.py | 133 - .../stable_diffusion/convert_from_ckpt.py | 1421 ------- .../convert_from_ckpt_deprecated.py | 1151 ------ .../stable_diffusion/hf_clip_model.py | 1314 ------- .../pipeline_cycle_diffusion.py | 717 ---- .../pipeline_fastdeploy_cycle_diffusion.py | 434 --- .../pipeline_fastdeploy_stable_diffusion.py | 329 -- ..._fastdeploy_stable_diffusion_controlnet.py | 31 - ...deploy_stable_diffusion_image_variation.py | 328 -- ...ine_fastdeploy_stable_diffusion_img2img.py | 352 -- ...ine_fastdeploy_stable_diffusion_inpaint.py | 556 --- ...tdeploy_stable_diffusion_inpaint_legacy.py | 527 --- ...peline_fastdeploy_stable_diffusion_mega.py | 360 -- ...ine_fastdeploy_stable_diffusion_upscale.py | 317 -- .../pipeline_stable_diffusion.py | 608 --- .../pipeline_stable_diffusion_adapter.py | 592 --- .../pipeline_stable_diffusion_all_in_one.py | 1343 ------- ...line_stable_diffusion_attend_and_excite.py | 993 ----- .../pipeline_stable_diffusion_controlnet.py | 947 ----- .../pipeline_stable_diffusion_depth2img.py | 664 ---- ...peline_stable_diffusion_image_variation.py | 374 -- .../pipeline_stable_diffusion_img2img.py | 678 ---- .../pipeline_stable_diffusion_inpaint.py | 803 ---- ...ipeline_stable_diffusion_inpaint_legacy.py | 650 ---- ...eline_stable_diffusion_instruct_pix2pix.py | 670 ---- .../pipeline_stable_diffusion_k_diffusion.py | 16 - ...ipeline_stable_diffusion_latent_upscale.py | 479 --- .../pipeline_stable_diffusion_mega.py | 203 - ...pipeline_stable_diffusion_model_editing.py | 731 ---- .../pipeline_stable_diffusion_panorama.py | 601 --- .../pipeline_stable_diffusion_pix2pix_zero.py | 1182 ------ .../pipeline_stable_diffusion_sag.py | 717 ---- .../pipeline_stable_diffusion_upscale.py | 549 --- .../pipeline_stable_unclip.py | 838 ----- .../pipeline_stable_unclip_img2img.py | 725 ---- .../stable_diffusion/safety_checker.py | 129 - .../stable_unclip_image_normalizer.py | 73 - .../stable_diffusion_safe/__init__.py | 84 - .../pipeline_stable_diffusion_safe.py | 713 ---- .../stable_diffusion_safe/safety_checker.py | 113 - .../stochastic_karras_ve/__init__.py | 17 - .../pipeline_stochastic_karras_ve.py | 128 - .../text_to_video_synthesis/__init__.py | 52 - .../pipeline_text_to_video_synth.py | 481 --- .../pipeline_text_to_video_zero.py | 536 --- .../ppdiffusers/pipelines/unclip/__init__.py | 33 - .../pipelines/unclip/pipeline_unclip.py | 505 --- .../unclip/pipeline_unclip_image_variation.py | 441 --- .../ppdiffusers/pipelines/unclip/text_proj.py | 91 - .../pipelines/unidiffuser/__init__.py | 56 - .../pipelines/unidiffuser/caption_decoder.py | 230 -- .../unidiffuser/pipeline_unidiffuser.py | 826 ---- .../pipelines/versatile_diffusion/__init__.py | 43 - .../versatile_diffusion/modeling_text_unet.py | 1699 --------- .../pipeline_versatile_diffusion.py | 457 --- 
...ipeline_versatile_diffusion_dual_guided.py | 556 --- ...ine_versatile_diffusion_image_variation.py | 385 -- ...eline_versatile_diffusion_text_to_image.py | 472 --- .../pipelines/vq_diffusion/__init__.py | 23 - .../vq_diffusion/pipeline_vq_diffusion.py | 341 -- .../ppdiffusers/schedulers/__init__.py | 64 - .../schedulers/preconfig/__init__.py | 38 - ...fig_scheduling_euler_ancestral_discrete.py | 310 -- .../preconfig_scheduling_lms_discrete.py | 337 -- .../ppdiffusers/schedulers/scheduling_ddim.py | 424 --- .../schedulers/scheduling_ddim_inverse.py | 267 -- .../ppdiffusers/schedulers/scheduling_ddpm.py | 459 --- .../schedulers/scheduling_deis_multistep.py | 509 --- .../scheduling_dpmsolver_multistep.py | 606 --- .../scheduling_dpmsolver_singlestep.py | 626 ---- .../scheduling_dpmsolver_unidiffuser.py | 473 --- .../scheduling_euler_ancestral_discrete.py | 273 -- .../schedulers/scheduling_euler_discrete.py | 347 -- .../schedulers/scheduling_heun_discrete.py | 339 -- .../schedulers/scheduling_ipndm.py | 165 - .../scheduling_k_dpm_2_ancestral_discrete.py | 345 -- .../schedulers/scheduling_k_dpm_2_discrete.py | 327 -- .../schedulers/scheduling_karras_ve.py | 233 -- .../schedulers/scheduling_lms_discrete.py | 295 -- .../ppdiffusers/schedulers/scheduling_pndm.py | 426 --- .../schedulers/scheduling_repaint.py | 324 -- .../schedulers/scheduling_sde_ve.py | 277 -- .../schedulers/scheduling_sde_vp.py | 90 - .../schedulers/scheduling_unclip.py | 312 -- .../schedulers/scheduling_unipc_multistep.py | 640 ---- .../schedulers/scheduling_utils.py | 178 - .../schedulers/scheduling_vq_diffusion.py | 497 --- ppdiffusers/ppdiffusers/training_utils.py | 382 -- ppdiffusers/ppdiffusers/utils/__init__.py | 143 - ppdiffusers/ppdiffusers/utils/constants.py | 87 - .../ppdiffusers/utils/deprecation_utils.py | 64 - ppdiffusers/ppdiffusers/utils/doc_utils.py | 39 - .../ppdiffusers/utils/download_utils.py | 661 ---- .../utils/dummy_fastdeploy_objects.py | 32 - .../utils/dummy_note_seq_objects.py | 31 - .../utils/dummy_paddle_and_einops_objects.py | 32 - .../utils/dummy_paddle_and_librosa_objects.py | 47 - ...paddle_and_paddlenlp_and_einops_objects.py | 32 - ...le_and_paddlenlp_and_fastdeploy_objects.py | 124 - ...e_and_paddlenlp_and_k_diffusion_objects.py | 32 - ...ddle_and_paddlenlp_and_note_seq_objects.py | 31 - .../dummy_paddle_and_paddlenlp_objects.py | 692 ---- .../utils/dummy_paddle_and_scipy_objects.py | 47 - .../ppdiffusers/utils/dummy_paddle_objects.py | 780 ---- .../utils/dynamic_modules_utils.py | 445 --- ppdiffusers/ppdiffusers/utils/hub_utils.py | 219 -- ppdiffusers/ppdiffusers/utils/import_utils.py | 619 --- .../ppdiffusers/utils/initializer_utils.py | 325 -- ppdiffusers/ppdiffusers/utils/load_utils.py | 365 -- ppdiffusers/ppdiffusers/utils/logging.py | 339 -- .../ppdiffusers/utils/model_card_template.md | 48 - ppdiffusers/ppdiffusers/utils/outputs.py | 118 - ppdiffusers/ppdiffusers/utils/paddle_utils.py | 212 -- ppdiffusers/ppdiffusers/utils/pil_utils.py | 66 - .../ppdiffusers/utils/testing_utils.py | 539 --- ppdiffusers/ppdiffusers/version.py | 17 - ppdiffusers/requirements.txt | 5 - .../cocoeval_keypoints_score/README.md | 83 - .../cocoeval_keypoints_score/annotator | 1 - .../cocoeval_keypoints.py | 32 - ...t_openpose_keypoints_result_coco_format.py | 264 -- .../scripts/convert_diffusers_model/README.md | 323 -- ...eDiffusionImageVariation_to_ppdiffusers.py | 250 -- ...DiffusionUpscalePipeline_to_ppdiffusers.py | 252 -- ...users_VersatileDiffusion_to_ppdiffusers.py | 263 -- 
..._diffusers_alt_diffusion_to_ppdiffusers.py | 331 -- ...ert_diffusers_controlnet_to_ppdiffusers.py | 64 - ...s_latent_diffusion_model_to_ppdiffusers.py | 172 - ...diffusers_paintbyexample_to_ppdiffusers.py | 181 - ...table_diffusion2.0_depth_to_ppdiffusers.py | 196 - ...ble_diffusion_controlnet_to_ppdiffusers.py | 257 -- ...ffusers_stable_diffusion_to_ppdiffusers.py | 244 -- ...convert_diffusers_unclip_to_ppdiffusers.py | 221 -- ...t_diffusers_vq_diffusion_to_ppdiffusers.py | 186 - .../convert_orig_sd_ckpt_to_ppdiffusers.py | 974 ----- ..._stablediffusion2.0_ckpt_to_ppdiffusers.py | 755 ---- ...iffusers_stable_diffusion_to_fastdeploy.py | 169 - .../convert_diffusers_model/requirements.txt | 7 - ppdiffusers/scripts/fid_clip_score/README.md | 79 - .../fid_clip_score/compute_fid_clip_score.py | 158 - .../scripts/fid_clip_score/fid_score.py | 331 -- .../scripts/fid_clip_score/inception.py | 493 --- ppdiffusers/setup.py | 71 - ppdiffusers/tests/__init__.py | 14 - .../fixtures/custom_pipeline/__init__.py | 14 - .../fixtures/custom_pipeline/pipeline.py | 99 - .../fixtures/custom_pipeline/what_ever.py | 99 - ppdiffusers/tests/fixtures/elise_format0.mid | Bin 14210 -> 0 bytes ppdiffusers/tests/models/__init__.py | 14 - .../tests/models/test_attention_processor.py | 89 - ppdiffusers/tests/models/test_layers_utils.py | 674 ---- ppdiffusers/tests/models/test_lora_layers.py | 222 -- .../tests/models/test_modeling_common.py | 381 -- .../tests/models/test_models_unet_1d.py | 234 -- .../tests/models/test_models_unet_2d.py | 244 -- .../models/test_models_unet_2d_condition.py | 797 ---- .../models/test_models_unet_3d_condition.py | 345 -- ppdiffusers/tests/models/test_models_vae.py | 322 -- ppdiffusers/tests/models/test_models_vq.py | 90 - .../tests/models/test_unet_2d_blocks.py | 556 --- .../tests/models/test_unet_blocks_common.py | 102 - ppdiffusers/tests/others/__init__.py | 14 - ppdiffusers/tests/others/test_config.py | 182 - ppdiffusers/tests/others/test_ema.py | 155 - ppdiffusers/tests/others/test_hub_utils.py | 49 - .../tests/others/test_image_processor.py | 149 - ppdiffusers/tests/others/test_outputs.py | 62 - ppdiffusers/tests/others/test_training.py | 139 - ppdiffusers/tests/others/test_utils.py | 152 - ppdiffusers/tests/pipelines/__init__.py | 14 - .../tests/pipelines/altdiffusion/__init__.py | 14 - .../altdiffusion/test_alt_diffusion.py | 224 -- .../test_alt_diffusion_img2img.py | 255 -- .../pipelines/audio_diffusion/__init__.py | 14 - .../audio_diffusion/test_audio_diffusion.py | 167 - .../tests/pipelines/audioldm/__init__.py | 13 - .../tests/pipelines/audioldm/test_audioldm.py | 399 -- .../pipelines/dance_diffusion/__init__.py | 14 - .../dance_diffusion/test_dance_diffusion.py | 115 - ppdiffusers/tests/pipelines/ddim/__init__.py | 14 - ppdiffusers/tests/pipelines/ddim/test_ddim.py | 106 - ppdiffusers/tests/pipelines/ddpm/__init__.py | 14 - ppdiffusers/tests/pipelines/ddpm/test_ddpm.py | 99 - .../tests/pipelines/deepfloyd_if/__init__.py | 281 -- .../tests/pipelines/deepfloyd_if/test_if.py | 322 -- .../pipelines/deepfloyd_if/test_if_img2img.py | 78 - .../test_if_img2img_superresolution.py | 77 - .../deepfloyd_if/test_if_inpainting.py | 77 - .../test_if_inpainting_superresolution.py | 79 - .../deepfloyd_if/test_if_superresolution.py | 75 - ppdiffusers/tests/pipelines/dit/__init__.py | 14 - ppdiffusers/tests/pipelines/dit/test_dit.py | 185 - .../tests/pipelines/karras_ve/__init__.py | 14 - .../pipelines/karras_ve/test_karras_ve.py | 73 - .../pipelines/latent_diffusion/__init__.py | 14 - 
.../latent_diffusion/test_latent_diffusion.py | 192 - .../test_latent_diffusion_superresolution.py | 107 - .../test_latent_diffusion_uncond.py | 105 - .../pipelines/paint_by_example/__init__.py | 14 - .../paint_by_example/test_paint_by_example.py | 188 - .../tests/pipelines/pipeline_params.py | 137 - ppdiffusers/tests/pipelines/pndm/__init__.py | 14 - ppdiffusers/tests/pipelines/pndm/test_pndm.py | 83 - .../tests/pipelines/repaint/__init__.py | 14 - .../tests/pipelines/repaint/test_repaint.py | 136 - .../tests/pipelines/score_sde_ve/__init__.py | 14 - .../score_sde_ve/test_score_sde_ve.py | 73 - .../semantic_stable_diffusion/__init__.py | 14 - .../test_semantic_diffusion.py | 509 --- .../spectrogram_diffusion/__init__.py | 13 - .../test_spectrogram_diffusion.py | 232 -- .../pipelines/stable_diffusion/__init__.py | 14 - .../stable_diffusion/test_cycle_diffusion.py | 224 -- .../test_onnx_stable_diffusion.py | 265 -- .../test_onnx_stable_diffusion_img2img.py | 206 - .../test_onnx_stable_diffusion_inpaint.py | 121 - ...st_onnx_stable_diffusion_inpaint_legacy.py | 81 - .../stable_diffusion/test_stable_diffusion.py | 674 ---- .../test_stable_diffusion_adapter.py | 217 -- .../test_stable_diffusion_controlnet.py | 355 -- .../test_stable_diffusion_image_variation.py | 270 -- .../test_stable_diffusion_img2img.py | 446 --- .../test_stable_diffusion_inpaint.py | 455 --- .../test_stable_diffusion_inpaint_legacy.py | 541 --- ...st_stable_diffusion_instruction_pix2pix.py | 293 -- .../test_stable_diffusion_k_diffusion.py | 87 - .../test_stable_diffusion_panorama.py | 269 -- .../test_stable_diffusion_pix2pix_zero.py | 409 -- .../test_stable_diffusion_sag.py | 147 - .../pipelines/stable_diffusion_2/__init__.py | 14 - .../test_stable_diffusion.py | 413 -- ...test_stable_diffusion_attend_and_excite.py | 172 - .../test_stable_diffusion_depth.py | 473 --- .../test_stable_diffusion_inpaint.py | 194 - .../test_stable_diffusion_latent_upscale.py | 229 -- .../test_stable_diffusion_upscale.py | 260 -- .../test_stable_diffusion_v_pred.py | 396 -- .../stable_diffusion_safe/__init__.py | 14 - .../test_safe_diffusion.py | 374 -- .../tests/pipelines/stable_unclip/__init__.py | 14 - .../stable_unclip/test_stable_unclip.py | 186 - .../test_stable_unclip_img2img.py | 218 -- .../tests/pipelines/test_pipeline_utils.py | 149 - ppdiffusers/tests/pipelines/test_pipelines.py | 1158 ------ .../tests/pipelines/test_pipelines_common.py | 488 --- .../tests/pipelines/text_to_video/__init__.py | 13 - .../text_to_video/test_text_to_video.py | 170 - .../text_to_video/test_text_to_video_zero.py | 38 - .../tests/pipelines/unclip/__init__.py | 14 - .../tests/pipelines/unclip/test_unclip.py | 367 -- .../unclip/test_unclip_image_variation.py | 398 -- .../tests/pipelines/unidiffuser/__init__.py | 13 - .../pipelines/versatile_diffusion/__init__.py | 14 - .../test_versatile_diffusion_dual_guided.py | 101 - ...est_versatile_diffusion_image_variation.py | 47 - .../test_versatile_diffusion_mega.py | 124 - .../test_versatile_diffusion_text_to_image.py | 72 - .../tests/pipelines/vq_diffusion/__init__.py | 14 - .../vq_diffusion/test_vq_diffusion.py | 199 - ppdiffusers/tests/schedulers/__init__.py | 13 - .../tests/schedulers/test_scheduler_ddim.py | 154 - .../tests/schedulers/test_scheduler_ddpm.py | 201 - .../tests/schedulers/test_scheduler_deis.py | 252 -- .../schedulers/test_scheduler_dpm_multi.py | 274 -- .../schedulers/test_scheduler_dpm_single.py | 226 -- .../tests/schedulers/test_scheduler_euler.py | 155 - .../test_scheduler_euler_ancestral.py | 
128 - .../tests/schedulers/test_scheduler_heun.py | 150 - .../tests/schedulers/test_scheduler_ipndm.py | 175 - .../test_scheduler_kdpm2_ancestral.py | 130 - .../test_scheduler_kdpm2_discrete.py | 127 - .../tests/schedulers/test_scheduler_lms.py | 127 - .../tests/schedulers/test_scheduler_pndm.py | 256 -- .../schedulers/test_scheduler_score_sde_ve.py | 211 -- .../tests/schedulers/test_scheduler_unclip.py | 151 - .../tests/schedulers/test_scheduler_unipc.py | 253 -- .../schedulers/test_scheduler_vq_diffusion.py | 70 - .../tests/schedulers/test_schedulers.py | 642 ---- 644 files changed, 169104 deletions(-) delete mode 100644 ppdiffusers/LICENSE delete mode 100644 ppdiffusers/Makefile delete mode 100644 ppdiffusers/VERSION delete mode 100644 ppdiffusers/deploy/README.md delete mode 100644 ppdiffusers/deploy/controlnet/README.md delete mode 100644 ppdiffusers/deploy/controlnet/export.md delete mode 100644 ppdiffusers/deploy/controlnet/export_model.py delete mode 100644 ppdiffusers/deploy/controlnet/infer.py delete mode 100644 ppdiffusers/deploy/controlnet/infer_dygraph.py delete mode 100644 ppdiffusers/deploy/controlnet/infer_dygraph_toch.py delete mode 100644 ppdiffusers/deploy/export.md delete mode 100644 ppdiffusers/deploy/export_model.py delete mode 100644 ppdiffusers/deploy/infer.py delete mode 100644 ppdiffusers/deploy/infer_dygraph.py delete mode 100644 ppdiffusers/deploy/infer_dygraph_torch.py delete mode 100644 ppdiffusers/deploy/requirements.txt delete mode 100644 ppdiffusers/deploy/scripts/test_controlnet_infer_dygraph.sh delete mode 100644 ppdiffusers/deploy/scripts/test_controlnet_infer_dygraph_torch.sh delete mode 100644 ppdiffusers/deploy/scripts/test_controlnet_infer_fd.sh delete mode 100644 ppdiffusers/deploy/scripts/test_infer_dygraph.sh delete mode 100644 ppdiffusers/deploy/scripts/test_infer_dygraph_torch.sh delete mode 100644 ppdiffusers/deploy/scripts/test_infer_fd.sh delete mode 100644 ppdiffusers/deploy/stable_diffusion_image_variation/export_model.py delete mode 100644 ppdiffusers/deploy/stable_diffusion_image_variation/infer.py delete mode 100644 ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph.py delete mode 100644 ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph_torch.py delete mode 100644 ppdiffusers/examples/Stable-CycleDiffusion/README.md delete mode 100644 ppdiffusers/examples/Stable-CycleDiffusion/app.py delete mode 100644 ppdiffusers/examples/Stable-CycleDiffusion/ptp_utils.py delete mode 100644 ppdiffusers/examples/Stable-CycleDiffusion/requirements.txt delete mode 100644 ppdiffusers/examples/Stable-CycleDiffusion/seq_aligner.py delete mode 100644 ppdiffusers/examples/autoencoder/vae/README.md delete mode 100644 ppdiffusers/examples/autoencoder/vae/config/f8encoder_f16decoder.yaml delete mode 100644 ppdiffusers/examples/autoencoder/vae/config/vae.json delete mode 100644 ppdiffusers/examples/autoencoder/vae/data/filelist/laion400m_en.filelist delete mode 100644 ppdiffusers/examples/autoencoder/vae/data/filelist/train.filelist.list delete mode 100644 ppdiffusers/examples/autoencoder/vae/data/filelist/write_filelist.py delete mode 100644 ppdiffusers/examples/autoencoder/vae/ldm/__init__.py delete mode 100644 ppdiffusers/examples/autoencoder/vae/ldm/autoencoder_datasets.py delete mode 100644 ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/__init__.py delete mode 100644 ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan.py delete mode 100644 
ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan_light.py delete mode 100644 ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/utils_image.py delete mode 100644 ppdiffusers/examples/autoencoder/vae/ldm/losses.py delete mode 100644 ppdiffusers/examples/autoencoder/vae/ldm/model.py delete mode 100644 ppdiffusers/examples/autoencoder/vae/ldm/text_image_pair.py delete mode 100644 ppdiffusers/examples/autoencoder/vae/requirements.txt delete mode 100644 ppdiffusers/examples/autoencoder/vae/run.sh delete mode 100644 ppdiffusers/examples/autoencoder/vae/scripts/README.md delete mode 100644 ppdiffusers/examples/autoencoder/vae/scripts/calculate_psnr_ssim.py delete mode 100644 ppdiffusers/examples/autoencoder/vae/scripts/convert_kl_8_to_ppdiffusers.py delete mode 100644 ppdiffusers/examples/autoencoder/vae/scripts/fid_score.py delete mode 100644 ppdiffusers/examples/autoencoder/vae/scripts/get_autoencoder_results.py delete mode 100644 ppdiffusers/examples/autoencoder/vae/scripts/inception.py delete mode 100644 ppdiffusers/examples/autoencoder/vae/train_vae.py delete mode 100644 ppdiffusers/examples/clip_interrogator/LICENSE delete mode 100644 ppdiffusers/examples/clip_interrogator/README.md delete mode 100644 ppdiffusers/examples/clip_interrogator/clip_interrogator/__init__.py delete mode 100644 ppdiffusers/examples/clip_interrogator/clip_interrogator/blip_decoder.py delete mode 100644 ppdiffusers/examples/clip_interrogator/clip_interrogator/clip_interrogator.py delete mode 100644 ppdiffusers/examples/clip_interrogator/dumpy.py delete mode 100644 ppdiffusers/examples/clip_interrogator/predict.py delete mode 100644 ppdiffusers/examples/clip_interrogator/requirements.txt delete mode 100755 ppdiffusers/examples/clip_interrogator/run_cli.py delete mode 100755 ppdiffusers/examples/clip_interrogator/run_gradio.py delete mode 100644 ppdiffusers/examples/community/README.md delete mode 100644 ppdiffusers/examples/community/clip_guided_images_mixing_stable_diffusion.py delete mode 100644 ppdiffusers/examples/community/clip_guided_stable_diffusion.py delete mode 100644 ppdiffusers/examples/community/composable_stable_diffusion.py delete mode 100644 ppdiffusers/examples/community/inference_clip_guided_stable_diffusion.py delete mode 100644 ppdiffusers/examples/community/interpolate_stable_diffusion.py delete mode 100644 ppdiffusers/examples/community/lpw_stable_diffusion.py delete mode 100644 ppdiffusers/examples/community/mixture_tiling.py delete mode 100644 ppdiffusers/examples/community/one_step_unet.py delete mode 100644 ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_hires_fix.py delete mode 100644 ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_mixture_tiling.py delete mode 100644 ppdiffusers/examples/community/reference_only.py delete mode 100644 ppdiffusers/examples/community/stable_diffusion_controlnet_img2img.py delete mode 100644 ppdiffusers/examples/community/stable_diffusion_hires_fix.py delete mode 100644 ppdiffusers/examples/community/stable_diffusion_mega.py delete mode 100644 ppdiffusers/examples/community/webui_stable_diffusion.py delete mode 100644 ppdiffusers/examples/community/wildcard_stable_diffusion.py delete mode 100644 ppdiffusers/examples/controlnet/README.md delete mode 100644 ppdiffusers/examples/controlnet/annotator/_base_/ade20k.yml delete mode 100644 ppdiffusers/examples/controlnet/annotator/_base_/cityscapes.yml delete mode 100644 ppdiffusers/examples/controlnet/annotator/_base_/cityscapes_1024x1024.yml delete 
mode 100644 ppdiffusers/examples/controlnet/annotator/canny/__init__.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/hed/__init__.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/midas_paddle/__init__.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/midas_paddle/api_inference.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/mlsd/__init__.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/mlsd/models/mbv2_mlsd_large.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/mlsd/utils.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/openpose/__init__.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/openpose/util.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/__init__.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/benchmark_utils.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_infer.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_utils.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/infer.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_infer.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_postprocess.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_preprocess.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/picodet_postprocess.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/preprocess.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/util.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/utils.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/visualize.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/segformer_paddle/__init__.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/segformer_paddle/predict.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/segformer_paddle/segformer_b5_ade20k_512x512_160k.yml delete mode 100644 ppdiffusers/examples/controlnet/annotator/segformer_paddle/segformer_b5_cityscapes_1024x1024_160k.yml delete mode 100644 ppdiffusers/examples/controlnet/annotator/segmenter_paddle/__init__.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/segmenter_paddle/predict.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/segmenter_paddle/segmenter_vit_base_linear_ade20k_512x512_160k.yml delete mode 100644 ppdiffusers/examples/controlnet/annotator/segmenter_paddle/segmenter_vit_base_mask_ade20k_512x512_160k.yml delete mode 100644 ppdiffusers/examples/controlnet/annotator/shuffle/__init__.py delete mode 100644 ppdiffusers/examples/controlnet/annotator/util.py delete mode 100644 ppdiffusers/examples/controlnet/control/__init__.py delete mode 100644 ppdiffusers/examples/controlnet/control/control_args.py delete mode 100644 ppdiffusers/examples/controlnet/control/control_trainer.py delete mode 100644 ppdiffusers/examples/controlnet/control/dumpy_dataset.py delete mode 100644 ppdiffusers/examples/controlnet/control/model.py delete mode 100644 ppdiffusers/examples/controlnet/extract_controlnet_ema_weights.py delete mode 100644 ppdiffusers/examples/controlnet/gradio_canny2image.py delete mode 100644 ppdiffusers/examples/controlnet/gradio_depth2image.py delete mode 100644 
ppdiffusers/examples/controlnet/gradio_hed2image.py delete mode 100644 ppdiffusers/examples/controlnet/gradio_hough2image.py delete mode 100644 ppdiffusers/examples/controlnet/gradio_ip2p2image.py delete mode 100644 ppdiffusers/examples/controlnet/gradio_normal2image.py delete mode 100644 ppdiffusers/examples/controlnet/gradio_pose2image_openpose.py delete mode 100644 ppdiffusers/examples/controlnet/gradio_pose2image_ppdetpose.py delete mode 100644 ppdiffusers/examples/controlnet/gradio_seg2image_segformer.py delete mode 100644 ppdiffusers/examples/controlnet/gradio_seg2image_segmenter.py delete mode 100644 ppdiffusers/examples/controlnet/gradio_shuffle2image.py delete mode 100644 ppdiffusers/examples/controlnet/requirements.txt delete mode 100644 ppdiffusers/examples/controlnet/train_txt2img_control_trainer.py delete mode 100644 ppdiffusers/examples/dreambooth/README.md delete mode 100644 ppdiffusers/examples/dreambooth/requirements.txt delete mode 100644 ppdiffusers/examples/dreambooth/train_dreambooth.py delete mode 100644 ppdiffusers/examples/dreambooth/train_dreambooth_lora.py delete mode 100644 ppdiffusers/examples/inference/README.md delete mode 100644 ppdiffusers/examples/inference/dual_text_and_image_guided_generation-versatile_diffusion.py delete mode 100644 ppdiffusers/examples/inference/image_guided_image_inpainting-paint_by_example.py delete mode 100644 ppdiffusers/examples/inference/image_inpainting-repaint.py delete mode 100644 ppdiffusers/examples/inference/image_mixing-stable_diffusion.py delete mode 100644 ppdiffusers/examples/inference/image_to_image_text_guided_generation-alt_diffusion.py delete mode 100644 ppdiffusers/examples/inference/image_to_image_text_guided_generation-controlnet.py delete mode 100644 ppdiffusers/examples/inference/image_to_image_text_guided_generation-deepfloyd_if.py delete mode 100644 ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion.py delete mode 100644 ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion_2.py delete mode 100644 ppdiffusers/examples/inference/image_to_text_generation-unidiffuser.py delete mode 100644 ppdiffusers/examples/inference/image_variation-stable_diffusion.py delete mode 100644 ppdiffusers/examples/inference/image_variation-unidiffuser.py delete mode 100644 ppdiffusers/examples/inference/image_variation-versatile_diffusion.py delete mode 100644 ppdiffusers/examples/inference/super_resolution-latent_diffusion.py delete mode 100644 ppdiffusers/examples/inference/text_guided_generation-semantic_stable_diffusion.py delete mode 100644 ppdiffusers/examples/inference/text_guided_image_inpainting-deepfloyd_if.py delete mode 100644 ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion.py delete mode 100644 ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion_2.py delete mode 100644 ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py delete mode 100644 ppdiffusers/examples/inference/text_to_audio_generation-audio_ldm.py delete mode 100644 ppdiffusers/examples/inference/text_to_image_generation-alt_diffusion.py delete mode 100644 ppdiffusers/examples/inference/text_to_image_generation-controlnet.py delete mode 100644 ppdiffusers/examples/inference/text_to_image_generation-deepfloyd_if.py delete mode 100644 ppdiffusers/examples/inference/text_to_image_generation-latent_diffusion.py delete mode 100644 ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion.py delete mode 
100644 ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_2.py delete mode 100644 ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_safe.py delete mode 100644 ppdiffusers/examples/inference/text_to_image_generation-t2i-adapter.py delete mode 100644 ppdiffusers/examples/inference/text_to_image_generation-unclip.py delete mode 100644 ppdiffusers/examples/inference/text_to_image_generation-unidiffuser.py delete mode 100644 ppdiffusers/examples/inference/text_to_image_generation-versatile_diffusion.py delete mode 100644 ppdiffusers/examples/inference/text_to_image_generation-vq_diffusion.py delete mode 100644 ppdiffusers/examples/inference/text_to_image_generation_mixture_tiling-stable_diffusion.py delete mode 100644 ppdiffusers/examples/inference/text_to_video_generation-synth.py delete mode 100644 ppdiffusers/examples/inference/text_to_video_generation-zero.py delete mode 100644 ppdiffusers/examples/inference/text_variation-unidiffuser.py delete mode 100644 ppdiffusers/examples/inference/unconditional_audio_generation-audio_diffusion.py delete mode 100644 ppdiffusers/examples/inference/unconditional_audio_generation-dance_diffusion.py delete mode 100644 ppdiffusers/examples/inference/unconditional_audio_generation-spectrogram_diffusion.py delete mode 100644 ppdiffusers/examples/inference/unconditional_image_generation-ddim.py delete mode 100644 ppdiffusers/examples/inference/unconditional_image_generation-ddpm.py delete mode 100644 ppdiffusers/examples/inference/unconditional_image_generation-latent_diffusion_uncond.py delete mode 100644 ppdiffusers/examples/inference/unconditional_image_generation-pndm.py delete mode 100644 ppdiffusers/examples/inference/unconditional_image_generation-score_sde_ve.py delete mode 100644 ppdiffusers/examples/inference/unconditional_image_generation-stochastic_karras_ve.py delete mode 100644 ppdiffusers/examples/inference/unconditional_image_generation-unidiffuser.py delete mode 100644 ppdiffusers/examples/inference/unconditional_image_text_joint_generation-unidiffuser.py delete mode 100644 ppdiffusers/examples/inference/unconditional_text_generation-unidiffuser.py delete mode 100644 ppdiffusers/examples/reproduce/README.md delete mode 100644 ppdiffusers/examples/reproduce/README_cn.md delete mode 100644 ppdiffusers/examples/reproduce/align_record.md delete mode 100644 ppdiffusers/examples/stable_diffusion/bf16.sh delete mode 100644 ppdiffusers/examples/stable_diffusion/data/filelist/laion400m_en.filelist delete mode 100644 ppdiffusers/examples/stable_diffusion/data/filelist/laion_aes.filelist delete mode 100644 ppdiffusers/examples/stable_diffusion/data/filelist/laion_aes.filelist.list delete mode 100644 ppdiffusers/examples/stable_diffusion/data/filelist/train.filelist.list delete mode 100644 ppdiffusers/examples/stable_diffusion/data/filelist/write_filelist.py delete mode 100644 ppdiffusers/examples/stable_diffusion/prepare.sh delete mode 100644 ppdiffusers/examples/stable_diffusion/requirements.txt delete mode 100644 ppdiffusers/examples/stable_diffusion/sd/__init__.py delete mode 100644 ppdiffusers/examples/stable_diffusion/sd/model.py delete mode 100644 ppdiffusers/examples/stable_diffusion/sd/sd_args.py delete mode 100644 ppdiffusers/examples/stable_diffusion/sd/sd_trainer.py delete mode 100644 ppdiffusers/examples/stable_diffusion/sd/text_image_pair_dataset.py delete mode 100644 ppdiffusers/examples/stable_diffusion/train_txt2img_laion400m_trainer.py delete mode 100644 
ppdiffusers/examples/t2i-adapter/README.md delete mode 100644 ppdiffusers/examples/t2i-adapter/adapter/__init__.py delete mode 100644 ppdiffusers/examples/t2i-adapter/adapter/adapter_args.py delete mode 100644 ppdiffusers/examples/t2i-adapter/adapter/adapter_trainer.py delete mode 100644 ppdiffusers/examples/t2i-adapter/adapter/annotator_utils.py delete mode 100644 ppdiffusers/examples/t2i-adapter/adapter/data_preprocess.py delete mode 100644 ppdiffusers/examples/t2i-adapter/adapter/dumpy_dataset.py delete mode 100644 ppdiffusers/examples/t2i-adapter/adapter/model.py delete mode 100644 ppdiffusers/examples/t2i-adapter/adapter/text_image_pair_dataset.py delete mode 120000 ppdiffusers/examples/t2i-adapter/annotator delete mode 100644 ppdiffusers/examples/t2i-adapter/config/openpose_adapter.json delete mode 100644 ppdiffusers/examples/t2i-adapter/data/laion-aes-canny.filelist.test delete mode 100644 ppdiffusers/examples/t2i-adapter/data/laion-aes-canny.filelist.train delete mode 100644 ppdiffusers/examples/t2i-adapter/data/laion-aes-openpose.filelist.test delete mode 100644 ppdiffusers/examples/t2i-adapter/data/laion-aes-openpose.filelist.train delete mode 100644 ppdiffusers/examples/t2i-adapter/data/test.canny.filelist delete mode 100644 ppdiffusers/examples/t2i-adapter/data/test.openpose.filelist delete mode 100644 ppdiffusers/examples/t2i-adapter/data/train.canny.filelist delete mode 100644 ppdiffusers/examples/t2i-adapter/data/train.openpose.filelist delete mode 100644 ppdiffusers/examples/t2i-adapter/generate.py delete mode 100644 ppdiffusers/examples/t2i-adapter/requirements.txt delete mode 100644 ppdiffusers/examples/t2i-adapter/tools/convert_diffusers_adapter_to_ppdiffusers.py delete mode 100644 ppdiffusers/examples/t2i-adapter/tools/convert_orig_adapter_ckpt_to_ppdiffusers.py delete mode 100644 ppdiffusers/examples/t2i-adapter/tools/convert_t2i_adapter_to_latest_version.py delete mode 100644 ppdiffusers/examples/t2i-adapter/tools/make_dummpy_dataset.py delete mode 100644 ppdiffusers/examples/t2i-adapter/train_t2i_adapter_trainer.py delete mode 100644 ppdiffusers/examples/text_to_image/README.md delete mode 100644 ppdiffusers/examples/text_to_image/requirements.txt delete mode 100644 ppdiffusers/examples/text_to_image/train_text_to_image.py delete mode 100644 ppdiffusers/examples/text_to_image/train_text_to_image_lora.py delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/README.md delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/config/ldmbert.json delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/config/unet.json delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/data/filelist/laion400m_en.filelist delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/data/filelist/laion_aes.filelist delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/data/filelist/laion_aes.filelist.list delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/data/filelist/train.filelist.list delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/data/filelist/write_filelist.py delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/generate_images.py delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/generate_pipelines.py delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/ldm/__init__.py delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_args.py delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_trainer.py delete mode 100644 
ppdiffusers/examples/text_to_image_laion400m/ldm/model.py delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/ldm/text_image_pair_dataset.py delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/requirements.txt delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/scripts/README.md delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/scripts/convert_orig_ldm_ckpt_to_ppdiffusers.py delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/scripts/convert_ppdiffusers_to_orig_ldm_ckpt.py delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/scripts/plot_fid_clip_score.py delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/scripts/text2img_L12H768_unet800M.yaml delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/scripts/text2img_L32H1280_unet800M.yaml delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_no_trainer.py delete mode 100644 ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_trainer.py delete mode 100644 ppdiffusers/examples/textual_inversion/README.md delete mode 100644 ppdiffusers/examples/textual_inversion/requirements.txt delete mode 100644 ppdiffusers/examples/textual_inversion/train_textual_inversion.py delete mode 100644 ppdiffusers/examples/tomesd/README.md delete mode 100644 ppdiffusers/examples/unconditional_image_generation/README.md delete mode 100644 ppdiffusers/examples/unconditional_image_generation/requirements.txt delete mode 100644 ppdiffusers/examples/unconditional_image_generation/train_unconditional.py delete mode 100644 ppdiffusers/ppdiffusers/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/commands/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/commands/env.py delete mode 100644 ppdiffusers/ppdiffusers/commands/ppdiffusers_cli.py delete mode 100644 ppdiffusers/ppdiffusers/configuration_utils.py delete mode 100644 ppdiffusers/ppdiffusers/experimental/README.md delete mode 100644 ppdiffusers/ppdiffusers/experimental/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/experimental/rl/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py delete mode 100644 ppdiffusers/ppdiffusers/image_processor.py delete mode 100644 ppdiffusers/ppdiffusers/initializer.py delete mode 100644 ppdiffusers/ppdiffusers/loaders.py delete mode 100644 ppdiffusers/ppdiffusers/models/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/models/adapter.py delete mode 100644 ppdiffusers/ppdiffusers/models/attention.py delete mode 100644 ppdiffusers/ppdiffusers/models/attention_processor.py delete mode 100644 ppdiffusers/ppdiffusers/models/autoencoder_kl.py delete mode 100644 ppdiffusers/ppdiffusers/models/controlnet.py delete mode 100644 ppdiffusers/ppdiffusers/models/cross_attention.py delete mode 100644 ppdiffusers/ppdiffusers/models/dual_transformer_2d.py delete mode 100644 ppdiffusers/ppdiffusers/models/ema.py delete mode 100644 ppdiffusers/ppdiffusers/models/embeddings.py delete mode 100644 ppdiffusers/ppdiffusers/models/modeling_pytorch_paddle_utils.py delete mode 100644 ppdiffusers/ppdiffusers/models/modeling_utils.py delete mode 100644 ppdiffusers/ppdiffusers/models/prior_transformer.py delete mode 100644 ppdiffusers/ppdiffusers/models/resnet.py delete mode 100644 ppdiffusers/ppdiffusers/models/t5_film_transformer.py delete mode 100644 ppdiffusers/ppdiffusers/models/transformer_2d.py delete mode 100644 ppdiffusers/ppdiffusers/models/transformer_temporal.py delete mode 100644 
ppdiffusers/ppdiffusers/models/unet_1d.py delete mode 100644 ppdiffusers/ppdiffusers/models/unet_1d_blocks.py delete mode 100644 ppdiffusers/ppdiffusers/models/unet_2d.py delete mode 100644 ppdiffusers/ppdiffusers/models/unet_2d_blocks.py delete mode 100644 ppdiffusers/ppdiffusers/models/unet_2d_condition.py delete mode 100644 ppdiffusers/ppdiffusers/models/unet_3d_blocks.py delete mode 100644 ppdiffusers/ppdiffusers/models/unet_3d_condition.py delete mode 100644 ppdiffusers/ppdiffusers/models/uvit.py delete mode 100644 ppdiffusers/ppdiffusers/models/vae.py delete mode 100644 ppdiffusers/ppdiffusers/models/vq_model.py delete mode 100644 ppdiffusers/ppdiffusers/optimization.py delete mode 100644 ppdiffusers/ppdiffusers/patches/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/patches/ppnlp_patch_utils.py delete mode 100644 ppdiffusers/ppdiffusers/patches/tomesd_patch_utils.py delete mode 100644 ppdiffusers/ppdiffusers/patches/webui_lora_patch_utils.py delete mode 100644 ppdiffusers/ppdiffusers/pipeline_utils.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/README.md delete mode 100644 ppdiffusers/ppdiffusers/pipelines/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/alt_diffusion/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/alt_diffusion/modeling_roberta_series.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/audio_diffusion/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/audio_diffusion/mel.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/audioldm/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/audioldm/pipeline_audioldm.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/dance_diffusion/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/ddim/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/ddim/pipeline_ddim.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/ddpm/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/ddpm/pipeline_ddpm.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/safety_checker.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/timesteps.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/watermark.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/dit/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/dit/pipeline_dit.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/fastdeploy_utils.py delete mode 100644 
ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/paint_by_example/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/paint_by_example/image_encoder.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/paint_by_example/pipeline_paint_by_example.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/pndm/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/pndm/pipeline_pndm.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/repaint/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/repaint/pipeline_repaint.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/score_sde_ve/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/custom_quantile.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/continous_encoder.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/midi_utils.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/notes_encoder.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt_deprecated.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/hf_clip_model.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_controlnet.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_image_variation.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_img2img.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint_legacy.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_mega.py delete mode 100644 
ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_upscale.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_adapter.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_all_in_one.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py delete mode 100755 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_mega.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/safety_checker.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/safety_checker.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/unclip/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip.py delete mode 100644 
ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip_image_variation.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/unclip/text_proj.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/unidiffuser/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/unidiffuser/caption_decoder.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/unidiffuser/pipeline_unidiffuser.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/modeling_text_unet.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/vq_diffusion/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_ddim_inverse.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_deis_multistep.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_singlestep.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_unidiffuser.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_euler_ancestral_discrete.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_euler_discrete.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_heun_discrete.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_ipndm.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_discrete.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_karras_ve.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_lms_discrete.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_repaint.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_sde_ve.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_sde_vp.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_unclip.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_unipc_multistep.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_utils.py delete mode 100644 ppdiffusers/ppdiffusers/schedulers/scheduling_vq_diffusion.py delete mode 100644 ppdiffusers/ppdiffusers/training_utils.py delete mode 100644 ppdiffusers/ppdiffusers/utils/__init__.py 
delete mode 100644 ppdiffusers/ppdiffusers/utils/constants.py delete mode 100644 ppdiffusers/ppdiffusers/utils/deprecation_utils.py delete mode 100644 ppdiffusers/ppdiffusers/utils/doc_utils.py delete mode 100644 ppdiffusers/ppdiffusers/utils/download_utils.py delete mode 100644 ppdiffusers/ppdiffusers/utils/dummy_fastdeploy_objects.py delete mode 100644 ppdiffusers/ppdiffusers/utils/dummy_note_seq_objects.py delete mode 100644 ppdiffusers/ppdiffusers/utils/dummy_paddle_and_einops_objects.py delete mode 100644 ppdiffusers/ppdiffusers/utils/dummy_paddle_and_librosa_objects.py delete mode 100644 ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_and_einops_objects.py delete mode 100644 ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_and_fastdeploy_objects.py delete mode 100644 ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_and_k_diffusion_objects.py delete mode 100644 ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_and_note_seq_objects.py delete mode 100644 ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_objects.py delete mode 100644 ppdiffusers/ppdiffusers/utils/dummy_paddle_and_scipy_objects.py delete mode 100644 ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py delete mode 100644 ppdiffusers/ppdiffusers/utils/dynamic_modules_utils.py delete mode 100644 ppdiffusers/ppdiffusers/utils/hub_utils.py delete mode 100644 ppdiffusers/ppdiffusers/utils/import_utils.py delete mode 100644 ppdiffusers/ppdiffusers/utils/initializer_utils.py delete mode 100644 ppdiffusers/ppdiffusers/utils/load_utils.py delete mode 100644 ppdiffusers/ppdiffusers/utils/logging.py delete mode 100644 ppdiffusers/ppdiffusers/utils/model_card_template.md delete mode 100644 ppdiffusers/ppdiffusers/utils/outputs.py delete mode 100644 ppdiffusers/ppdiffusers/utils/paddle_utils.py delete mode 100644 ppdiffusers/ppdiffusers/utils/pil_utils.py delete mode 100644 ppdiffusers/ppdiffusers/utils/testing_utils.py delete mode 100644 ppdiffusers/ppdiffusers/version.py delete mode 100644 ppdiffusers/requirements.txt delete mode 100644 ppdiffusers/scripts/cocoeval_keypoints_score/README.md delete mode 120000 ppdiffusers/scripts/cocoeval_keypoints_score/annotator delete mode 100644 ppdiffusers/scripts/cocoeval_keypoints_score/cocoeval_keypoints.py delete mode 100644 ppdiffusers/scripts/cocoeval_keypoints_score/get_openpose_keypoints_result_coco_format.py delete mode 100644 ppdiffusers/scripts/convert_diffusers_model/README.md delete mode 100644 ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionImageVariation_to_ppdiffusers.py delete mode 100644 ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionUpscalePipeline_to_ppdiffusers.py delete mode 100644 ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_VersatileDiffusion_to_ppdiffusers.py delete mode 100644 ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_alt_diffusion_to_ppdiffusers.py delete mode 100644 ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_controlnet_to_ppdiffusers.py delete mode 100644 ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_latent_diffusion_model_to_ppdiffusers.py delete mode 100644 ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_paintbyexample_to_ppdiffusers.py delete mode 100644 ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion2.0_depth_to_ppdiffusers.py delete mode 100644 ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers.py 
delete mode 100644 ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_to_ppdiffusers.py delete mode 100644 ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_unclip_to_ppdiffusers.py delete mode 100644 ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_vq_diffusion_to_ppdiffusers.py delete mode 100644 ppdiffusers/scripts/convert_diffusers_model/convert_orig_sd_ckpt_to_ppdiffusers.py delete mode 100644 ppdiffusers/scripts/convert_diffusers_model/convert_orig_stablediffusion2.0_ckpt_to_ppdiffusers.py delete mode 100644 ppdiffusers/scripts/convert_diffusers_model/convert_ppdiffusers_stable_diffusion_to_fastdeploy.py delete mode 100644 ppdiffusers/scripts/convert_diffusers_model/requirements.txt delete mode 100644 ppdiffusers/scripts/fid_clip_score/README.md delete mode 100644 ppdiffusers/scripts/fid_clip_score/compute_fid_clip_score.py delete mode 100755 ppdiffusers/scripts/fid_clip_score/fid_score.py delete mode 100644 ppdiffusers/scripts/fid_clip_score/inception.py delete mode 100644 ppdiffusers/setup.py delete mode 100644 ppdiffusers/tests/__init__.py delete mode 100644 ppdiffusers/tests/fixtures/custom_pipeline/__init__.py delete mode 100644 ppdiffusers/tests/fixtures/custom_pipeline/pipeline.py delete mode 100644 ppdiffusers/tests/fixtures/custom_pipeline/what_ever.py delete mode 100644 ppdiffusers/tests/fixtures/elise_format0.mid delete mode 100644 ppdiffusers/tests/models/__init__.py delete mode 100644 ppdiffusers/tests/models/test_attention_processor.py delete mode 100644 ppdiffusers/tests/models/test_layers_utils.py delete mode 100644 ppdiffusers/tests/models/test_lora_layers.py delete mode 100644 ppdiffusers/tests/models/test_modeling_common.py delete mode 100644 ppdiffusers/tests/models/test_models_unet_1d.py delete mode 100644 ppdiffusers/tests/models/test_models_unet_2d.py delete mode 100644 ppdiffusers/tests/models/test_models_unet_2d_condition.py delete mode 100644 ppdiffusers/tests/models/test_models_unet_3d_condition.py delete mode 100644 ppdiffusers/tests/models/test_models_vae.py delete mode 100644 ppdiffusers/tests/models/test_models_vq.py delete mode 100644 ppdiffusers/tests/models/test_unet_2d_blocks.py delete mode 100644 ppdiffusers/tests/models/test_unet_blocks_common.py delete mode 100644 ppdiffusers/tests/others/__init__.py delete mode 100644 ppdiffusers/tests/others/test_config.py delete mode 100644 ppdiffusers/tests/others/test_ema.py delete mode 100644 ppdiffusers/tests/others/test_hub_utils.py delete mode 100644 ppdiffusers/tests/others/test_image_processor.py delete mode 100644 ppdiffusers/tests/others/test_outputs.py delete mode 100644 ppdiffusers/tests/others/test_training.py delete mode 100644 ppdiffusers/tests/others/test_utils.py delete mode 100644 ppdiffusers/tests/pipelines/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/altdiffusion/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py delete mode 100644 ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py delete mode 100644 ppdiffusers/tests/pipelines/audio_diffusion/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py delete mode 100644 ppdiffusers/tests/pipelines/audioldm/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/audioldm/test_audioldm.py delete mode 100644 ppdiffusers/tests/pipelines/dance_diffusion/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py delete mode 100644 
ppdiffusers/tests/pipelines/ddim/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/ddim/test_ddim.py delete mode 100644 ppdiffusers/tests/pipelines/ddpm/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/ddpm/test_ddpm.py delete mode 100644 ppdiffusers/tests/pipelines/deepfloyd_if/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/deepfloyd_if/test_if.py delete mode 100644 ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img.py delete mode 100644 ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py delete mode 100644 ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting.py delete mode 100644 ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py delete mode 100644 ppdiffusers/tests/pipelines/deepfloyd_if/test_if_superresolution.py delete mode 100644 ppdiffusers/tests/pipelines/dit/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/dit/test_dit.py delete mode 100644 ppdiffusers/tests/pipelines/karras_ve/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/karras_ve/test_karras_ve.py delete mode 100644 ppdiffusers/tests/pipelines/latent_diffusion/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py delete mode 100644 ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py delete mode 100644 ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py delete mode 100644 ppdiffusers/tests/pipelines/paint_by_example/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/paint_by_example/test_paint_by_example.py delete mode 100644 ppdiffusers/tests/pipelines/pipeline_params.py delete mode 100644 ppdiffusers/tests/pipelines/pndm/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/pndm/test_pndm.py delete mode 100644 ppdiffusers/tests/pipelines/repaint/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/repaint/test_repaint.py delete mode 100644 ppdiffusers/tests/pipelines/score_sde_ve/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py delete mode 100644 ppdiffusers/tests/pipelines/semantic_stable_diffusion/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py delete mode 100644 ppdiffusers/tests/pipelines/spectrogram_diffusion/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint_legacy.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py delete mode 100644 
ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion_2/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion_safe/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py delete mode 100644 ppdiffusers/tests/pipelines/stable_unclip/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip.py delete mode 100644 ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py delete mode 100644 ppdiffusers/tests/pipelines/test_pipeline_utils.py delete mode 100644 ppdiffusers/tests/pipelines/test_pipelines.py delete mode 100644 ppdiffusers/tests/pipelines/test_pipelines_common.py delete mode 100644 ppdiffusers/tests/pipelines/text_to_video/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/text_to_video/test_text_to_video.py delete mode 100644 ppdiffusers/tests/pipelines/text_to_video/test_text_to_video_zero.py delete mode 100644 ppdiffusers/tests/pipelines/unclip/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/unclip/test_unclip.py delete mode 100644 ppdiffusers/tests/pipelines/unclip/test_unclip_image_variation.py delete mode 100644 ppdiffusers/tests/pipelines/unidiffuser/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/versatile_diffusion/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py delete mode 100644 ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py delete mode 100644 ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py delete mode 100644 ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py delete mode 100644 ppdiffusers/tests/pipelines/vq_diffusion/__init__.py delete mode 100644 ppdiffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py delete mode 100644 ppdiffusers/tests/schedulers/__init__.py delete mode 100644 ppdiffusers/tests/schedulers/test_scheduler_ddim.py delete mode 100644 ppdiffusers/tests/schedulers/test_scheduler_ddpm.py delete mode 100644 
ppdiffusers/tests/schedulers/test_scheduler_deis.py delete mode 100644 ppdiffusers/tests/schedulers/test_scheduler_dpm_multi.py delete mode 100644 ppdiffusers/tests/schedulers/test_scheduler_dpm_single.py delete mode 100644 ppdiffusers/tests/schedulers/test_scheduler_euler.py delete mode 100644 ppdiffusers/tests/schedulers/test_scheduler_euler_ancestral.py delete mode 100644 ppdiffusers/tests/schedulers/test_scheduler_heun.py delete mode 100644 ppdiffusers/tests/schedulers/test_scheduler_ipndm.py delete mode 100644 ppdiffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py delete mode 100644 ppdiffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py delete mode 100644 ppdiffusers/tests/schedulers/test_scheduler_lms.py delete mode 100644 ppdiffusers/tests/schedulers/test_scheduler_pndm.py delete mode 100644 ppdiffusers/tests/schedulers/test_scheduler_score_sde_ve.py delete mode 100644 ppdiffusers/tests/schedulers/test_scheduler_unclip.py delete mode 100644 ppdiffusers/tests/schedulers/test_scheduler_unipc.py delete mode 100644 ppdiffusers/tests/schedulers/test_scheduler_vq_diffusion.py delete mode 100755 ppdiffusers/tests/schedulers/test_schedulers.py diff --git a/ppdiffusers/LICENSE b/ppdiffusers/LICENSE deleted file mode 100644 index 962fee016f4e..000000000000 --- a/ppdiffusers/LICENSE +++ /dev/null @@ -1,203 +0,0 @@ - - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file diff --git a/ppdiffusers/Makefile b/ppdiffusers/Makefile deleted file mode 100644 index 160de104e005..000000000000 --- a/ppdiffusers/Makefile +++ /dev/null @@ -1,30 +0,0 @@ - -.DEFAULT_GOAL := all - -.PHONY: all -all: deploy-version build deploy - -.PHONY: build -build: - python3 setup.py sdist bdist_wheel - -.PHONY: deploy -deploy: - make deploy-version - twine upload --skip-existing dist/* - -.PHONY: deploy-version -deploy-version: - echo "VERSION = '$$(cat VERSION)'" > ppdiffusers/version.py - -.PHONY: install -install: - pip install -r requirements.txt - -.PHONY: version -version: - @newVersion=$$(awk -F. 
'{print $$1"."$$2"."$$3+1}' < VERSION) \ - && echo $${newVersion} > VERSION \ - && git add VERSION \ - && git commit -m "🔥 update version to $${newVersion}" > /dev/null \ - && echo "Bumped version to $${newVersion}" \ No newline at end of file diff --git a/ppdiffusers/VERSION b/ppdiffusers/VERSION deleted file mode 100644 index 2a0970ca757c..000000000000 --- a/ppdiffusers/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.16.1 diff --git a/ppdiffusers/deploy/README.md b/ppdiffusers/deploy/README.md deleted file mode 100644 index ffb6cb9cb16c..000000000000 --- a/ppdiffusers/deploy/README.md +++ /dev/null @@ -1,186 +0,0 @@ -# FastDeploy Stable Diffusion 模型高性能部署 - - **目录** - * [部署模型准备](#部署模型准备) - * [环境依赖](#环境依赖) - * [快速体验](#快速体验) - * [文图生成(Text-to-Image Generation)](#文图生成) - * [文本引导的图像变换(Image-to-Image Text-Guided Generation)](#文本引导的图像变换) - * [文本引导的图像编辑(Text-Guided Image Inpainting)](#文本引导的图像编辑) - -⚡️[FastDeploy](https://github.com/PaddlePaddle/FastDeploy)是一款全场景、易用灵活、极致高效的AI推理部署工具,为开发者提供多硬件、多推理引擎后端的部署能力。开发者只需调用一行代码即可随意切换硬件、推理引擎后端。本示例展现如何通过 FastDeploy 将我们 PPDiffusers 训练好的 Stable Diffusion 模型进行多硬件、多推理引擎后端高性能部署。 - - - -## 部署模型准备 - -本示例需要使用训练模型导出后的部署模型,可参考[模型导出文档](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/deploy/export.md)导出部署模型。 - - - -## 环境依赖 - -在示例中使用了 FastDeploy,需要执行以下命令安装依赖。 - -```shell -pip install fastdeploy-gpu-python -f https://www.paddlepaddle.org.cn/whl/fastdeploy.html -``` - - - -## 快速体验 - -我们经过部署模型准备,可以开始进行测试。本目录提供 StableDiffusion 模型支持的三种任务,分别是文图生成、文本引导的图像变换以及文本引导的图像编辑。 - - - -### 文图生成(Text-to-Image Generation) - - -下面将指定模型目录,推理引擎后端,硬件以及 scheduler 类型,运行 `infer.py` 脚本,完成文图生成任务。 - -```sh -python infer.py --model_dir stable-diffusion-v1-4/ --scheduler "pndm" --backend paddle --task_name text2img -``` - -脚本的输入提示语句为 **"a photo of an astronaut riding a horse on mars"**, 得到的图像文件为 text2img.png。生成的图片示例如下(每次生成的图片都不相同,示例仅作参考): - -![text2img.png](https://user-images.githubusercontent.com/10826371/200261112-68e53389-e0a0-42d1-8c3a-f35faa6627d7.png) - -如果使用 stable-diffusion-v1-5 模型,则可执行以下命令完成推理: - -```sh -python infer.py --model_dir stable-diffusion-v1-5/ --scheduler "preconfig-euler-ancestral" --backend paddle_tensorrt --use_fp16 True --device gpu --task_name text2img -``` - -同时,我们还提供基于两阶段 HiresFix 的文图生成示例。下面将指定模型目录,指定任务名称为 `hiresfix` 后,运行 `infer.py` 脚本,完成`两阶段hiresfix任务`,在第一阶段我们生成了 `512x512分辨率` 的图片,然后在第二阶段我们在一阶段的基础上修复生成了 `768x768分辨率` 图片。 - -| without hiresfix | with hiresfix | -|:-------------------:|:-------------------:| -|![][without-hiresfix]|![][with-hiresfix]| - -[without-hiresfix]: https://github.com/PaddlePaddle/PaddleNLP/assets/50394665/38ab6032-b960-4b76-8d69-0e0f8b5e1f42 -[with-hiresfix]: https://github.com/PaddlePaddle/PaddleNLP/assets/50394665/a472cb31-d8a2-451d-bf80-cd84c9ef0d08 - -在80G A100上,ppdiffusers==0.16.1、fastdeploy==1.0.7、develop paddle、cuda11.7 的环境下,我们测出了如下的速度。 -- without hiresfix 的速度为:Mean latency: 1.930896 s, p50 latency: 1.932413 s, p90 latency: 1.933565 s, p95 latency: 1.933630 s. -- with hiresfix 的速度为:Mean latency: 1.442178 s, p50 latency: 1.442885 s, p90 latency: 1.446133 s, p95 latency: 1.446285 s. 
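The two-stage hires-fix described above is, in effect, a 512x512 text-to-image pass followed by a 768x768 image-to-image refinement pass; the deployed `hiresfix` task (run with the command below) fuses both stages inside a single FastDeploy pipeline. As a rough illustration of the idea only, a dygraph sketch might look like the following; the pipeline classes and call signature are assumed from the ppdiffusers API and are not taken from `infer.py`.

```python
# Illustrative two-stage hires-fix sketch (assumed ppdiffusers dygraph API,
# not the fused FastDeploy implementation used by infer.py).
from ppdiffusers import StableDiffusionImg2ImgPipeline, StableDiffusionPipeline

prompt = "a photo of an astronaut riding a horse on mars"

# Stage 1: sample a 512x512 base image.
base_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
base = base_pipe(prompt, height=512, width=512, num_inference_steps=50).images[0]

# Stage 2: upscale to 768x768 and refine with an img2img pass.
refine_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
upscaled = base.resize((768, 768))
final = refine_pipe(prompt, image=upscaled, strength=0.5, num_inference_steps=50).images[0]
final.save("hiresfix.png")
```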
- -```sh -python infer.py --model_dir stable-diffusion-v1-5/ --scheduler "euler-ancestral" --backend paddle_tensorrt --use_fp16 True --device gpu --task_name hiresfix -``` - - - -### 文本引导的图像变换(Image-to-Image Text-Guided Generation) - -下面将指定模型目录,推理引擎后端,硬件以及 scheduler 类型,运行 `infer.py` 脚本,完成文本引导的图像变换任务。 - -```sh -python infer.py --model_dir stable-diffusion-v1-4/ --scheduler "pndm" --backend paddle_tensorrt --use_fp16 True --device gpu --task_name img2img -``` - -脚本输入的提示语句为 **"A fantasy landscape, trending on artstation"**,运行得到的图像文件为 img2img.png。生成的图片示例如下(每次生成的图片都不相同,示例仅作参考): - -| input image | output image | -|:-------------------:|:-------------------:| -|![][sketch-mountains-input]|![][fantasy_landscape]| - -[sketch-mountains-input]: https://user-images.githubusercontent.com/10826371/217207485-09ee54de-4ba2-4cff-9d6c-fd426d4c1831.png -[fantasy_landscape]: https://user-images.githubusercontent.com/10826371/217200795-811a8c73-9fb3-4445-b363-b445c7ee52cd.png - - - -如果使用 stable-diffusion-v1-5 模型,则可执行以下命令完成推理: - -```sh -python infer.py --model_dir stable-diffusion-v1-5/ --scheduler "euler-ancestral" --backend paddle_tensorrt --use_fp16 True --device gpu --task_name img2img -``` - - -同时,我们还提供基于 CycleDiffusion 的文本引导的图像变换示例。下面将指定模型目录,运行 `infer.py` 脚本,完成文本引导的图像变换任务。 - -```sh -python infer.py --model_dir stable-diffusion-v1-4/ --backend paddle_tensorrt --use_fp16 True --device gpu --task_name cycle_diffusion -``` - -脚本输入的源提示语句为 **"An astronaut riding a horse"**,目标提示语句为 **"An astronaut riding an elephant"**,运行得到的图像文件为 cycle_diffusion.png。生成的图片示例如下(每次生成的图片都不相同,示例仅作参考): - -| input image | output image | -|:-------------------:|:-------------------:| -|![][horse]|![][elephant]| - -[horse]: https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/An%20astronaut%20riding%20a%20horse.png -[elephant]: https://user-images.githubusercontent.com/10826371/223315865-4490b586-1de7-4616-a245-9c008c3ffb6b.png - - - -### 文本引导的图像编辑(Text-Guided Image Inpainting) - -注意!当前有两种版本的图像编辑代码,一个是 Legacy 版本,一个是正式版本,下面将分别介绍两种版本的使用示例。 - -#### Legacy 版本 - -下面将指定模型目录,推理引擎后端,硬件以及 scheduler 类型,运行 `infer.py` 脚本,完成文本引导的图像编辑任务。 - -```sh -python infer.py --model_dir stable-diffusion-v1-4/ --scheduler euler-ancestral --backend paddle_tensorrt --use_fp16 True --device gpu --task_name inpaint_legacy -``` - -脚本输入的提示语为 **"Face of a yellow cat, high resolution, sitting on a park bench"**,运行得到的图像文件为 inpaint_legacy.png。生成的图片示例如下(每次生成的图片都不相同,示例仅作参考): - -| input image | mask image | output image -|:-------------------:|:-------------------:|:-------------------:| -|![][input]|![][mask]|![][output]| - -[input]: https://user-images.githubusercontent.com/10826371/217423470-b2a3f8ac-618b-41ee-93e2-121bddc9fd36.png -[mask]: https://user-images.githubusercontent.com/10826371/217424068-99d0a97d-dbc3-4126-b80c-6409d2fd7ebc.png -[output]: https://user-images.githubusercontent.com/10826371/217455594-187aa99c-b321-4535-aca0-9159ad658a97.png - -如果使用 stable-diffusion-v1-5 模型,则可执行以下命令完成推理: - -```sh -python infer.py --model_dir stable-diffusion-v1-5/ --scheduler euler-ancestral --backend paddle_tensorrt --use_fp16 True --device gpu --task_name inpaint_legacy -``` - -#### 正式版本 - -下面将指定模型目录,推理引擎后端,硬件以及 scheduler 类型,运行 `infer.py` 脚本,完成文本引导的图像编辑任务。 - -```sh -python infer.py --model_dir stable-diffusion-v1-5-inpainting/ --scheduler euler-ancestral --backend paddle_tensorrt --use_fp16 True --device gpu --task_name inpaint -``` - -脚本输入的提示语为 **"Face of a yellow cat, high resolution, sitting on a park bench"**,运行得到的图像文件为 
inpaint.png。生成的图片示例如下(每次生成的图片都不相同,示例仅作参考): - -| input image | mask image | output image -|:-------------------:|:-------------------:|:-------------------:| -|![][input_2]|![][mask_2]|![][output_2]| - -[input_2]: https://user-images.githubusercontent.com/10826371/217423470-b2a3f8ac-618b-41ee-93e2-121bddc9fd36.png -[mask_2]: https://user-images.githubusercontent.com/10826371/217424068-99d0a97d-dbc3-4126-b80c-6409d2fd7ebc.png -[output_2]: https://user-images.githubusercontent.com/10826371/217454490-7d6c6a89-fde6-4393-af8e-05e84961b354.png - -#### 参数说明 - -`infer.py` 除了以上示例的命令行参数,还支持更多命令行参数的设置。展开可查看各命令行参数的说明。 - - -| 参数 |参数说明 | -|----------|--------------| -| --model_dir | 导出后模型的目录。默认为 `runwayml/stable-diffusion-v1-5@fastdeploy` | -| --backend | 推理引擎后端。默认为 `paddle_tensorrt`,可选列表:`['onnx_runtime', 'paddle', 'paddlelite', 'paddle_tensorrt', 'tensorrt']`。 | -| --device | 运行设备。默认为 `gpu`,可选列表:`['cpu', 'gpu', 'huawei_ascend_npu', 'kunlunxin_xpu']`。 | -| --device_id | `gpu` 设备的 id。若 `device_id` 为`-1`,视为使用 `cpu` 推理。 | -| --inference_steps | `UNet` 模型运行的次数,默认为 `50`。 | -| --benchmark_steps | `Benchmark` 运行的次数,默认为 `1`。 | -| --use_fp16 | 是否使用 `fp16` 精度。默认为 `False`。使用 `paddle_tensorrt` 后端及 `kunlunxin_xpu` 设备时可以设为 `True` 开启。 | -| --task_name | 任务类型,默认为`text2img`,可选列表:`['text2img', 'img2img', 'inpaint', 'inpaint_legacy', 'cycle_diffusion', 'hiresfix', 'all']`。 注意,当`task_name`为`inpaint`时候,我们需要配合`runwayml/stable-diffusion-inpainting@fastdeploy`权重才能正常使用。| -| --scheduler | 采样器类型。默认为 `'preconfig-euler-ancestral'`。可选列表:`['pndm', 'lms', 'euler', 'euler-ancestral', 'preconfig-euler-ancestral', 'dpm-multi', 'dpm-single', 'unipc-multi', 'ddim', 'ddpm', 'deis-multi', 'heun', 'kdpm2-ancestral', 'kdpm2']`。| -| --infer_op | 推理所采用的op,可选列表 `['zero_copy_infer', 'raw', 'all']`,`zero_copy_infer`推理速度更快,默认值为`zero_copy_infer`。 | -| --parse_prompt_type | 处理prompt文本所使用的方法,可选列表 `['raw', 'lpw']`,`lpw`可强调句子中的单词,并且支持更长的文本输入,默认值为`lpw`。 | -| --width | 生成图片的宽度,取值范围 512~768。默认值为 512。| -| --height | 生成图片的高度,取值范围 512~768。默认值为 512。| -| --hr_resize_width | hiresfix 所要生成的宽度,取值范围 512~768。默认值为 768。| -| --hr_resize_height | hiresfix 所要生成的高度,取值范围 512~768。默认值为 768。| -| --is_sd2_0 | 是否为sd2.0的模型?默认为 False 。| diff --git a/ppdiffusers/deploy/controlnet/README.md b/ppdiffusers/deploy/controlnet/README.md deleted file mode 100644 index 50ea7d7e18d4..000000000000 --- a/ppdiffusers/deploy/controlnet/README.md +++ /dev/null @@ -1,136 +0,0 @@ -# FastDeploy ControlNet 模型高性能部署 - - **目录** - * [部署模型准备](#部署模型准备) - * [环境依赖](#环境依赖) - * [快速体验](#快速体验) - * [ControlNet文图生成(ControlNet-Text-to-Image Generation)](#ControlNet文图生成) - * [ControlNet文本引导的图像变换(ControlNet-Image-to-Image Text-Guided Generation)](#ControlNet文本引导的图像变换) - * [ControlNet文本引导的图像编辑(ControlNet-Text-Guided Image Inpainting)](#ControlNet文本引导的图像编辑) - -⚡️[FastDeploy](https://github.com/PaddlePaddle/FastDeploy) 是一款全场景、易用灵活、极致高效的AI推理部署工具,为开发者提供多硬件、多推理引擎后端的部署能力。开发者只需调用一行代码即可随意切换硬件、推理引擎后端。本示例展现如何通过 FastDeploy 将我们 PPDiffusers 训练好的 Stable Diffusion 模型进行多硬件、多推理引擎后端高性能部署。 - - - -## 部署模型准备 - -本示例需要使用训练模型导出后的部署模型,可参考[模型导出文档](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/deploy/controlnet/export.md)导出部署模型。 - - - -## 环境依赖 - -在示例中使用了 FastDeploy,需要执行以下命令安装依赖。 - -```shell -pip install fastdeploy-gpu-python -f https://www.paddlepaddle.org.cn/whl/fastdeploy.html -``` - - - -## 快速体验 - -我们经过部署模型准备,可以开始进行测试。本目录提供采用Canny边缘检测图片作为控制条件生成图片的教程 。 - - - -### ControlNet文图生成(ControlNet-Text-to-Image Generation) - -下面左图是我们提供的初始图片,右图是经过OpenCV中的Canny算法处理后得到的边缘检测图片。 - 
-![bird](https://user-images.githubusercontent.com/50394665/225192117-3ec7a61c-227b-4056-a076-d37759f8411b.png) -![control_bird_canny](https://user-images.githubusercontent.com/50394665/225192606-47ba975f-f6cc-4555-8d85-870dc1327b45.png) - -> Tips:为了能够跑出最快的推理速度,如果是使用`A卡GPU`的用户,请保证`低于8.5版本的TRT`不在`LD_LIBRARY_PATH`路径上。 - -下面将指定模型目录,推理引擎后端,硬件以及 scheduler 类型,运行 `infer.py` 脚本,完成 `Canny to Image` 任务。 - -```sh -python infer.py --model_dir ./control_sd15_canny --scheduler "ddim" --backend paddle --task text2img_control -``` - -脚本的输入提示语句为 **"bird"**, 得到的图像文件为 `text2img_control.png`。生成的图片示例如下(每次生成的图片都不相同,示例仅作参考): - -![text2img_control](https://github.com/PaddlePaddle/PaddleNLP/assets/50394665/c2f5e7f0-8abf-4a6c-bc38-bcaf8f58cac5) - -如果使用 `paddle_tensorrt` 推理引擎后端及开启`半精度推理`,则可执行以下命令完成推理: - -```sh -python infer.py --model_dir control_sd15_canny --scheduler "preconfig-euler-ancestral" --backend paddle_tensorrt --device gpu --benchmark_steps 10 --use_fp16 True --task text2img_control -``` - -经测试,使用上述命令,在 80G A100 机器上能够跑出 `1.111716 s` 的成绩。 - -同时,我们还提供基于两阶段 HiresFix 的可控文图生成示例。下面将指定模型目录,指定任务名称为 `hiresfix` 后,运行 `infer.py` 脚本,完成`两阶段hiresfix任务`,在第一阶段我们生成了 `512x512分辨率` 的图片,然后在第二阶段我们在一阶段的基础上修复生成了 `768x768分辨率` 图片。 - -| without hiresfix | with hiresfix | -|:-------------------:|:-------------------:| -|![][without-hiresfix]|![][with-hiresfix]| - -[without-hiresfix]: https://github.com/PaddlePaddle/PaddleNLP/assets/50394665/2e3002bc-4a55-4b73-869f-b4e065e62644 -[with-hiresfix]: https://github.com/PaddlePaddle/PaddleNLP/assets/50394665/3f80ce29-8854-4877-911a-11da928a0559 - -在80G A100上,ppdiffusers==0.16.1、fastdeploy==1.0.7、develop paddle、cuda11.7 的环境下,我们测出了如下的速度。 -- without hiresfix 的速度为:Mean latency: 2.715479 s, p50 latency: 2.715581 s, p90 latency: 2.717518 s, p95 latency: 2.719844 s. -- with hiresfix 的速度为:Mean latency: 2.027131 s, p50 latency: 2.026837 s, p90 latency: 2.028943 s, p95 latency: 2.032201 s. 
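To make the Canny-to-image flow above concrete, the sketch below strings the pieces together in dygraph mode: OpenCV produces the edge map (using the default 100/200 thresholds listed in the parameter table further down), and the ControlNet pipeline consumes it. The class names match the ppdiffusers imports used by `export_model.py`, but the call signature is an assumption and differs from the exported FastDeploy path used by `infer.py`.

```python
# Minimal dygraph sketch of the Canny-conditioned text-to-image flow (assumed API).
import cv2
import numpy as np
from PIL import Image

from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline

# 1. Turn the source photo into a 3-channel Canny edge map.
image = cv2.imread("bird.png")
edges = cv2.Canny(image, 100, 200)  # low/high thresholds, as in the parameter table
control_image = Image.fromarray(np.stack([edges] * 3, axis=-1))

# 2. Run the ControlNet-conditioned text-to-image pipeline.
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None
)
result = pipe("bird", image=control_image, num_inference_steps=50).images[0]
result.save("text2img_control.png")
```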
- - - -### ControlNet文本引导的图像变换(ControlNet-Image-to-Image Text-Guided Generation) - -```sh -python infer.py --model_dir ./control_sd15_canny --scheduler "euler-ancestral" --backend paddle_tensorrt --use_fp16 True --device gpu --task_name img2img_control -``` - -脚本输入的提示语句为 **"A fantasy landscape, trending on artstation"**,运行得到的图像文件为 img2img_control.png。生成的图片示例如下(每次生成的图片都不相同,示例仅作参考): - -| input image | output image | -|:-------------------:|:-------------------:| -|![][sketch-mountains-input]|![][fantasy_landscape]| - -[sketch-mountains-input]: https://user-images.githubusercontent.com/10826371/217207485-09ee54de-4ba2-4cff-9d6c-fd426d4c1831.png -[fantasy_landscape]: https://github.com/PaddlePaddle/PaddleNLP/assets/50394665/c3727ee3-2955-4ae9-9fbd-a434a9613eda - - - -### ControlNet文本引导的图像编辑(ControlNet-Text-Guided Image Inpainting) - -```sh -python infer.py ./control_sd15_canny --scheduler "euler-ancestral" --backend paddle_tensorrt --use_fp16 True --device gpu --task_name inpaint_legacy_control -``` - -脚本输入的提示语为 **"Face of a yellow cat, high resolution, sitting on a park bench"**,运行得到的图像文件为 inpaint_legacy_control.png。生成的图片示例如下(每次生成的图片都不相同,示例仅作参考): - -| input image | mask image | output image -|:-------------------:|:-------------------:|:-------------------:| -|![][input]|![][mask]|![][output]| - -[input]: https://user-images.githubusercontent.com/10826371/217423470-b2a3f8ac-618b-41ee-93e2-121bddc9fd36.png -[mask]: https://user-images.githubusercontent.com/10826371/217424068-99d0a97d-dbc3-4126-b80c-6409d2fd7ebc.png -[output]: https://github.com/PaddlePaddle/PaddleNLP/assets/50394665/63735f7d-038a-48d0-a688-7c1aa4912ab0 - - -#### 参数说明 - -`infer.py` 除了以上示例的命令行参数,还支持更多命令行参数的设置。展开可查看各命令行参数的说明。 - -| 参数 |参数说明 | -|----------|--------------| -| --model_dir | 导出后模型的目录。默认为 `runwayml/stable-diffusion-v1-5-canny@fastdeploy` | -| --backend | 推理引擎后端。默认为 `paddle_tensorrt`,可选列表:`['onnx_runtime', 'paddle', 'paddlelite', 'paddle_tensorrt', 'tensorrt']`。 | -| --device | 运行设备。默认为 `gpu`,可选列表:`['cpu', 'gpu', 'huawei_ascend_npu', 'kunlunxin_xpu']`。 | -| --device_id | `gpu` 设备的 id。若 `device_id` 为`-1`,视为使用 `cpu` 推理。 | -| --inference_steps | `UNet` 模型运行的次数,默认为 `50`。 | -| --benchmark_steps | `Benchmark` 运行的次数,默认为 `1`。 | -| --use_fp16 | 是否使用 `fp16` 精度。默认为 `False`。使用 `paddle_tensorrt` 后端及 `kunlunxin_xpu` 设备时可以设为 `True` 开启。 | -| --task_name | 任务类型,默认为`text2img`,可选列表:`['text2img_control', 'img2img_control', 'inpaint_legacy_control', 'hiresfix_control', 'all']`。| -| --scheduler | 采样器类型。默认为 `'preconfig-euler-ancestral'`。可选列表:`['pndm', 'lms', 'euler', 'euler-ancestral', 'preconfig-euler-ancestral', 'dpm-multi', 'dpm-single', 'unipc-multi', 'ddim', 'ddpm', 'deis-multi', 'heun', 'kdpm2-ancestral', 'kdpm2']`。| -| --infer_op | 推理所采用的op,可选列表 `['zero_copy_infer', 'raw', 'all']`,`zero_copy_infer`推理速度更快,默认值为`zero_copy_infer`。 | -| --parse_prompt_type | 处理prompt文本所使用的方法,可选列表 `['raw', 'lpw']`,`lpw`可强调句子中的单词,并且支持更长的文本输入,默认值为`lpw`。 | -| --low_threshold | Canny算法最后一步中,小于该阈值的像素直接置为0,默认值为 100。 | -| --high_threshold | Canny算法最后一步中,大于该阈值的像素直接置为255,默认值为 200。 | -| --width | 生成图片的宽度,取值范围 512~768。默认值为 512。| -| --height | 生成图片的高度,取值范围 512~768。默认值为 512。| -| --hr_resize_width | hiresfix 所要生成的宽度,取值范围 512~768。默认值为 768。| -| --hr_resize_height | hiresfix 所要生成的高度,取值范围 512~768。默认值为 768。| -| --is_sd2_0 | 是否为sd2.0的模型?默认为 False 。| diff --git a/ppdiffusers/deploy/controlnet/export.md b/ppdiffusers/deploy/controlnet/export.md deleted file mode 100644 index 638b3aae5691..000000000000 --- a/ppdiffusers/deploy/controlnet/export.md +++ /dev/null @@ -1,55 +0,0 @@ -# 
ControlNet 模型导出教程 - - -[PPDiffusers](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers) 是一款支持跨模态(如图像与语音)训练和推理的扩散模型(Diffusion Model)工具箱,其借鉴了🤗 Huggingface 团队的 [Diffusers](https://github.com/huggingface/diffusers) 的优秀设计,并且依托 [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) 框架和 [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP) 自然语言处理库。下面将介绍如何将 PPDiffusers 提供的预训练模型进行模型导出。 - -### 模型导出 - -可执行以下命令行完成模型导出。 - -```shell -python export_model.py --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 --controlnet_pretrained_model_name_or_path lllyasviel/sd-controlnet-canny --output_path control_sd15_canny --height=512 --width=512 -``` -注: 上述指令导出固定尺寸的模型,固定尺寸的导出模型有利于优化模型推理性能,但会牺牲一定灵活性。若要导出支持多种推理尺寸的模型,可取消参数--height和--width的设置。 - -输出的模型目录结构如下: - -```shell -control_sd15_canny/ -├── model_index.json -├── scheduler -│ └── scheduler_config.json -├── tokenizer -│ ├── tokenizer_config.json -│ ├── merges.txt -│ ├── vocab.json -│ └── special_tokens_map.json -├── text_encoder -│ ├── inference.pdiparams -│ ├── inference.pdiparams.info -│ └── inference.pdmodel -├── unet -│ ├── inference.pdiparams -│ ├── inference.pdiparams.info -│ └── inference.pdmodel -├── vae_decoder -│ ├── inference.pdiparams -│ ├── inference.pdiparams.info -│ └── inference.pdmodel -└── vae_encoder - ├── inference.pdiparams - ├── inference.pdiparams.info - └── inference.pdmodel -``` - - -`export_model.py` 各命令行参数的说明。 - -| 参数 |参数说明 | -|----------|--------------| -| --pretrained_model_name_or_path | ppdiffuers提供的diffusion预训练模型。默认为:"runwayml/stable-diffusion-v1-5"。更多 StableDiffusion 预训练模型可参考 [ppdiffusers 模型列表](../README.md#ppdiffusers模型支持的权重)。| -| --controlnet_pretrained_model_name_or_path | ppdiffuers提供的controlnet预训练模型。默认为:"lllyasviel/sd-controlnet-canny"。更多 ControlNET 预训练模型可参考 [lllyasviel的huggingface hub](https://huggingface.co/lllyasviel)。| -| --output_path | 导出的模型目录。 | -| --sample | vae encoder 的输出是否调整为 sample 模式,注意:sample模式会引入随机因素,默认是 False。| -| --height | 如果指定,则会固定导出模型的高度,即,在推理生成图片时只能生成该大小的图片,默认值为None。| -| --width | 如果指定,则会固定导出模型的宽度,即,在推理生成图片时只能生成该大小的图片,默认值为None。| diff --git a/ppdiffusers/deploy/controlnet/export_model.py b/ppdiffusers/deploy/controlnet/export_model.py deleted file mode 100644 index d739ebea43bf..000000000000 --- a/ppdiffusers/deploy/controlnet/export_model.py +++ /dev/null @@ -1,234 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
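# What this script does, at a glance:
#   1. Load the UNet and the ControlNet and wrap them in a single Layer
#      (ControlNetWithUnetModel) so they can be exported together.
#   2. Export text_encoder, the UNet+ControlNet wrapper, vae_encoder and
#      vae_decoder to static graph via paddle.jit.to_static / paddle.jit.save.
#   3. Reassemble the exported parts into a
#      FastDeployStableDiffusionControlNetPipeline and save it to --output_path.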
-import argparse -import os -from pathlib import Path -from types import MethodType - -import paddle - -from ppdiffusers import ( - ControlNetModel, - FastDeployRuntimeModel, - FastDeployStableDiffusionControlNetPipeline, - StableDiffusionControlNetPipeline, - UNet2DConditionModel, -) - - -class ControlNetWithUnetModel(paddle.nn.Layer): - def __init__( - self, - unet, - controlnet, - ): - super().__init__() - self.unet = unet - self.controlnet = controlnet - - def forward( - self, sample, timestep, encoder_hidden_states, controlnet_cond, controlnet_conditioning_scale, return_dict=True - ): - down_block_res_samples, mid_block_res_sample = self.controlnet( - sample, - timestep, - encoder_hidden_states=encoder_hidden_states, - controlnet_cond=controlnet_cond, - conditioning_scale=controlnet_conditioning_scale, - return_dict=False, - ) - - noise_pred = self.unet( - sample, - timestep, - encoder_hidden_states=encoder_hidden_states, - down_block_additional_residuals=down_block_res_samples, - mid_block_additional_residual=mid_block_res_sample, - return_dict=return_dict, - ) - return noise_pred - - -def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( - model_path: str, - controlnet_model_path: str, - output_path: str, - sample: bool = False, - height: int = None, - width: int = None, -): - unet_tmp = UNet2DConditionModel.from_pretrained(model_path, resnet_pre_temb_non_linearity=True, subfolder="unet") - controlnet_tmp = ControlNetModel.from_pretrained(controlnet_model_path, resnet_pre_temb_non_linearity=True) - - pipeline = StableDiffusionControlNetPipeline.from_pretrained( - model_path, - unet=unet_tmp, - controlnet=controlnet_tmp, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - # make sure we disable xformers - pipeline.disable_xformers_memory_efficient_attention() - output_path = Path(output_path) - # calculate latent's H and W - latent_height = height // 8 if height is not None else None - latent_width = width // 8 if width is not None else None - # get arguments - cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280 - unet_channels = pipeline.unet.config.in_channels # 4 - vae_in_channels = pipeline.vae.config.in_channels # 3 - vae_latent_channels = pipeline.vae.config.latent_channels # 4 - print( - f"cross_attention_dim: {cross_attention_dim}\n", - f"unet_in_channels: {unet_channels}\n", - f"vae_encoder_in_channels: {vae_in_channels}\n", - f"vae_decoder_latent_channels: {vae_latent_channels}", - ) - # 1. Convert text_encoder - text_encoder = paddle.jit.to_static( - pipeline.text_encoder, - input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids - ) - save_path = os.path.join(args.output_path, "text_encoder", "inference") - paddle.jit.save(text_encoder, save_path) - print(f"Save text_encoder model in {save_path} successfully.") - del pipeline.text_encoder - - # wrap unet + controlnet - new_unet = ControlNetWithUnetModel(unet=pipeline.unet, controlnet=pipeline.controlnet) - - # 2. 
Convert unet - unet = paddle.jit.to_static( - new_unet, - input_spec=[ - paddle.static.InputSpec( - shape=[None, unet_channels, latent_height, latent_width], dtype="float32", name="sample" - ), # sample - paddle.static.InputSpec(shape=[1], dtype="float32", name="timestep"), # timestep - paddle.static.InputSpec( - shape=[None, None, cross_attention_dim], dtype="float32", name="encoder_hidden_states" - ), # encoder_hidden_states - paddle.static.InputSpec( - shape=[None, vae_in_channels, height, width], dtype="float32", name="controlnet_cond" - ), # controlnet_cond - paddle.static.InputSpec( - shape=[len(pipeline.unet.config.block_out_channels) * 3 + 1], - dtype="float32", - name="controlnet_conditioning_scale", - ), # controlnet_conditioning_scale - ], - ) - - save_path = os.path.join(args.output_path, "unet", "inference") - paddle.jit.save(unet, save_path) - print(f"Save unet model in {save_path} successfully.") - del pipeline.unet - del new_unet - - def forward_vae_encoder_mode(self, z): - return self.encode(z, True).latent_dist.mode() - - def forward_vae_encoder_sample(self, z): - return self.encode(z, True).latent_dist.sample() - - # 3. Convert vae encoder - vae_encoder = pipeline.vae - if sample: - vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder) - else: - vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder) - - vae_encoder = paddle.jit.to_static( - vae_encoder, - input_spec=[ - paddle.static.InputSpec( - shape=[None, vae_in_channels, height, width], - dtype="float32", - name="sample", # N, C, H, W - ), # latent - ], - ) - # Save vae_encoder in static graph model. - save_path = os.path.join(args.output_path, "vae_encoder", "inference") - paddle.jit.save(vae_encoder, save_path) - print(f"Save vae_encoder model in {save_path} successfully.") - - # 4. Convert vae encoder - vae_decoder = pipeline.vae - - def forward_vae_decoder(self, z): - return self.decode(z, True).sample - - vae_decoder.forward = MethodType(forward_vae_decoder, vae_decoder) - vae_decoder = paddle.jit.to_static( - vae_decoder, - input_spec=[ - paddle.static.InputSpec( - shape=[None, vae_latent_channels, latent_height, latent_width], dtype="float32", name="latent_sample" - ), # latent_sample - ], - ) - # Save vae_decoder in static graph model. 
- save_path = os.path.join(args.output_path, "vae_decoder", "inference") - paddle.jit.save(vae_decoder, save_path) - print(f"Save vae_decoder model in {save_path} successfully.") - del pipeline.vae - - fastdeploy_pipeline = FastDeployStableDiffusionControlNetPipeline( - vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), - vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_decoder"), - text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder"), - unet=FastDeployRuntimeModel.from_pretrained(output_path / "unet"), - tokenizer=pipeline.tokenizer, - scheduler=pipeline.scheduler, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - fastdeploy_pipeline.save_pretrained(output_path) - print("FastDeploy pipeline saved to", output_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default="runwayml/stable-diffusion-v1-5", - help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).", - ) - parser.add_argument( - "--controlnet_pretrained_model_name_or_path", - type=str, - default="lllyasviel/sd-controlnet-canny", - help="Path to the `ppdiffusers` controlnet_pretrained_model_name_or_path checkpoint to convert (either a local directory or on the bos).", - ) - parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.") - parser.add_argument( - "--sample", action="store_true", default=False, help="Export the vae encoder in mode or sample" - ) - parser.add_argument("--height", type=int, default=None, help="The height of output images. Default: None") - parser.add_argument("--width", type=int, default=None, help="The width of output images. Default: None") - args = parser.parse_args() - - convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( - args.pretrained_model_name_or_path, - args.controlnet_pretrained_model_name_or_path, - args.output_path, - args.sample, - args.height, - args.width, - ) diff --git a/ppdiffusers/deploy/controlnet/infer.py b/ppdiffusers/deploy/controlnet/infer.py deleted file mode 100644 index 1cd1bdbefc79..000000000000 --- a/ppdiffusers/deploy/controlnet/infer.py +++ /dev/null @@ -1,625 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
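
One detail worth calling out from the export above: the fused UNet+ControlNet model takes `controlnet_conditioning_scale` as a vector with `len(block_out_channels) * 3 + 1` entries, one scale per ControlNet residual, rather than a single scalar. A short illustration (hypothetical values, assuming the stock SD 1.5 UNet configuration) of where the fixed `[13]` dynamic shape in the deploy scripts comes from:

```python
import numpy as np

# SD 1.5 UNet default block_out_channels has 4 entries (assumed here).
block_out_channels = (320, 640, 1280, 1280)

# One scale per ControlNet residual: 12 down-block residuals plus 1 mid-block residual.
num_scales = len(block_out_channels) * 3 + 1
assert num_scales == 13  # matches the [13] dynamic shape declared in infer.py

# The exported fused model expects a float32 vector of this length.
controlnet_conditioning_scale = np.full([num_scales], 1.0, dtype="float32")
```
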
- -import argparse -import os -import time - -# isort: split -import paddle - -# isort: split -import cv2 -import fastdeploy as fd -import numpy as np -from PIL import Image -from tqdm.auto import trange - -from paddlenlp.trainer.argparser import strtobool -from ppdiffusers import DiffusionPipeline, FastDeployStableDiffusionMegaPipeline -from ppdiffusers.utils import load_image - - -def get_canny_image(image, args): - if isinstance(image, Image.Image): - image = np.array(image) - image = cv2.Canny(image, args.low_threshold, args.high_threshold) - image = image[:, :, None] - image = np.concatenate([image, image, image], axis=2) - canny_image = Image.fromarray(image) - return canny_image - - -def parse_arguments(): - - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_dir", - default="runwayml/stable-diffusion-v1-5@fastdeploy", - help="The model directory of diffusion_model.", - ) - parser.add_argument("--inference_steps", type=int, default=50, help="The number of unet inference steps.") - parser.add_argument("--benchmark_steps", type=int, default=1, help="The number of performance benchmark steps.") - parser.add_argument( - "--backend", - type=str, - default="paddle_tensorrt", - # Note(zhoushunjie): Will support 'tensorrt' soon. - choices=["onnx_runtime", "paddle", "paddlelite", "paddle_tensorrt"], - help="The inference runtime backend of unet model and text encoder model.", - ) - parser.add_argument( - "--device", - type=str, - default="gpu", - # Note(shentanyue): Will support more devices. - choices=[ - "cpu", - "gpu", - "huawei_ascend_npu", - "kunlunxin_xpu", - ], - help="The inference runtime device of models.", - ) - parser.add_argument( - "--task_name", - type=str, - default="text2img_control", - choices=[ - "text2img_control", - "img2img_control", - "inpaint_legacy_control", - "hiresfix_control", - "all", - ], - help="The task can be one of [text2img_control, img2img_control, inpaint_legacy_control, hiresfix_control, all]. ", - ) - parser.add_argument( - "--parse_prompt_type", - type=str, - default="lpw", - choices=[ - "raw", - "lpw", - ], - help="The parse_prompt_type can be one of [raw, lpw]. ", - ) - parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") - parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. 
-1 means use cpu") - parser.add_argument( - "--scheduler", - type=str, - default="preconfig-euler-ancestral", - choices=[ - "pndm", - "lms", - "euler", - "euler-ancestral", - "preconfig-euler-ancestral", - "dpm-multi", - "dpm-single", - "unipc-multi", - "ddim", - "ddpm", - "deis-multi", - "heun", - "kdpm2-ancestral", - "kdpm2", - ], - help="The scheduler type of stable diffusion.", - ) - parser.add_argument( - "--infer_op", - type=str, - default="zero_copy_infer", - choices=[ - "zero_copy_infer", - "raw", - "all", - ], - help="The type of infer op.", - ) - parser.add_argument("--height", type=int, default=512, help="Height of input image") - parser.add_argument("--width", type=int, default=512, help="Width of input image") - parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image") - parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image") - parser.add_argument("--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?") - parser.add_argument("--low_threshold", type=int, default=100, help="The value of Canny low threshold.") - parser.add_argument("--high_threshold", type=int, default=200, help="The value of Canny high threshold.") - return parser.parse_args() - - -def create_ort_runtime(device_id=0): - option = fd.RuntimeOption() - option.use_ort_backend() - if device_id == -1: - option.use_cpu() - else: - option.use_gpu(device_id) - return option - - -def create_paddle_inference_runtime( - use_trt=False, - dynamic_shape=None, - use_fp16=False, - device_id=0, - disable_paddle_trt_ops=[], - disable_paddle_pass=[], - paddle_stream=None, - workspace=None, -): - option = fd.RuntimeOption() - option.use_paddle_backend() - if device_id == -1: - option.use_cpu() - else: - option.use_gpu(device_id) - if paddle_stream is not None and use_trt: - option.set_external_raw_stream(paddle_stream) - for pass_name in disable_paddle_pass: - option.paddle_infer_option.delete_pass(pass_name) - if use_trt: - option.paddle_infer_option.disable_trt_ops(disable_paddle_trt_ops) - option.paddle_infer_option.enable_trt = True - if workspace is not None: - option.set_trt_max_workspace_size(workspace) - if use_fp16: - option.trt_option.enable_fp16 = True - else: - # Note(zhoushunjie): These four passes don't support fp32 now. - # Remove this line of code in future. 
- only_fp16_passes = [ - "trt_cross_multihead_matmul_fuse_pass", - "trt_flash_multihead_matmul_fuse_pass", - "preln_elementwise_groupnorm_act_pass", - "elementwise_groupnorm_act_pass", - ] - for curr_pass in only_fp16_passes: - option.paddle_infer_option.delete_pass(curr_pass) - - # Need to enable collect shape - if dynamic_shape is not None: - option.paddle_infer_option.collect_trt_shape = True - for key, shape_dict in dynamic_shape.items(): - option.trt_option.set_shape( - key, shape_dict["min_shape"], shape_dict.get("opt_shape", None), shape_dict.get("max_shape", None) - ) - return option - - -def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): - option = fd.RuntimeOption() - option.use_paddle_lite_backend() - if device == "huawei_ascend_npu": - option.use_ascend() - option.set_lite_device_names(["huawei_ascend_npu"]) - option.set_lite_context_properties( - "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={};HUAWEI_ASCEND_NPU_PRECISION_MODE=allow_mix_precision".format( - device_id - ) - ) - elif device == "kunlunxin_xpu": - # TODO(shentanyue): Add kunlunxin_xpu code - # https://github.com/PaddlePaddle/FastDeploy/blob/4c3e7030e151528d304619901c794481bb2f6037/examples/multimodal/stable_diffusion/infer.py#L178-L195 - option.use_kunlunxin( - device_id, - l3_workspace_size=(64 * 1024 * 1024 - 4 * 1024), - locked=False, - autotune=False, - autotune_file="", - precision="int16", - adaptive_seqlen=True, - enable_multi_stream=True, - ) - if use_fp16: - option.enable_lite_fp16() - else: - pass - return option - - -def create_trt_runtime(workspace=(1 << 31), dynamic_shape=None, use_fp16=False, device_id=0): - option = fd.RuntimeOption() - option.use_trt_backend() - option.use_gpu(device_id) - if use_fp16: - option.enable_trt_fp16() - if workspace is not None: - option.set_trt_max_workspace_size(workspace) - if dynamic_shape is not None: - for key, shape_dict in dynamic_shape.items(): - option.set_trt_input_shape( - key, - min_shape=shape_dict["min_shape"], - opt_shape=shape_dict.get("opt_shape", None), - max_shape=shape_dict.get("max_shape", None), - ) - # cache_file = os.path.join(model_dir, model_prefix, "inference.trt") - # option.set_trt_cache_file(cache_file) - return option - - -def main(args): - if args.device_id == -1: - paddle.set_device("cpu") - paddle_stream = None - else: - paddle.set_device(f"gpu:{args.device_id}") - paddle_stream = paddle.device.cuda.current_stream(args.device_id).cuda_stream - - infer_op_dict = { - "vae_encoder": args.infer_op, - "vae_decoder": args.infer_op, - "text_encoder": args.infer_op, - "unet": args.infer_op, - } - seed = 1024 - vae_in_channels = 4 - text_encoder_max_length = 77 - unet_max_length = text_encoder_max_length * 3 # lpw support max_length is 77x3 - min_image_size = 512 - max_image_size = 768 - max_image_size = max(min_image_size, max_image_size) - hidden_states = 1024 if args.is_sd2_0 else 768 - unet_in_channels = 4 - - text_encoder_dynamic_shape = { - "input_ids": { - "min_shape": [1, text_encoder_max_length], - "max_shape": [1, text_encoder_max_length], - "opt_shape": [1, text_encoder_max_length], - } - } - vae_encoder_dynamic_shape = { - "sample": { - "min_shape": [1, 3, min_image_size, min_image_size], - "max_shape": [1, 3, max_image_size, max_image_size], - "opt_shape": [1, 3, min_image_size, min_image_size], - } - } - vae_decoder_dynamic_shape = { - "latent_sample": { - "min_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], - "max_shape": [1, vae_in_channels, max_image_size // 8, max_image_size // 8], - 
"opt_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], - } - } - unet_dynamic_shape = { - "sample": { - "min_shape": [1, unet_in_channels, min_image_size // 8, min_image_size // 8], - "max_shape": [2, unet_in_channels, max_image_size // 8, max_image_size // 8], - "opt_shape": [2, unet_in_channels, min_image_size // 8, min_image_size // 8], - }, - "timestep": { - "min_shape": [1], - "max_shape": [1], - "opt_shape": [1], - }, - "encoder_hidden_states": { - "min_shape": [1, text_encoder_max_length, hidden_states], - "max_shape": [2, unet_max_length, hidden_states], - "opt_shape": [2, text_encoder_max_length, hidden_states], - }, - "controlnet_cond": { - "min_shape": [1, 3, min_image_size, min_image_size], - "max_shape": [2, 3, max_image_size, max_image_size], - "opt_shape": [2, 3, min_image_size, min_image_size], - }, - "controlnet_conditioning_scale": { - "min_shape": [13], - "max_shape": [13], - "opt_shape": [13], - }, - } - # 4. Init runtime - if args.backend == "onnx_runtime": - runtime_options = dict( - text_encoder=create_ort_runtime(device_id=args.device_id), - vae_encoder=create_ort_runtime(device_id=args.device_id), - vae_decoder=create_ort_runtime(device_id=args.device_id), - unet=create_ort_runtime(device_id=args.device_id), - ) - elif args.backend == "paddlelite": - runtime_options = dict( - text_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), - vae_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), - vae_decoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), - unet=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=args.use_fp16), - ) - elif args.backend == "tensorrt": - runtime_options = dict( - text_encoder=create_trt_runtime( - dynamic_shape=text_encoder_dynamic_shape, use_fp16=args.use_fp16, device_id=args.device_id - ), - vae_encoder=create_trt_runtime( - dynamic_shape=vae_encoder_dynamic_shape, use_fp16=args.use_fp16, device_id=args.device_id - ), - vae_decoder=create_trt_runtime( - dynamic_shape=vae_decoder_dynamic_shape, use_fp16=args.use_fp16, device_id=args.device_id - ), - unet=create_trt_runtime( - dynamic_shape=unet_dynamic_shape, use_fp16=args.use_fp16, device_id=args.device_id - ), - ) - elif args.backend == "paddle" or args.backend == "paddle_tensorrt": - args.use_trt = args.backend == "paddle_tensorrt" - runtime_options = dict( - text_encoder=create_paddle_inference_runtime( - use_trt=args.use_trt, - dynamic_shape=text_encoder_dynamic_shape, - use_fp16=args.use_fp16, - device_id=args.device_id, - disable_paddle_trt_ops=["arg_max", "range", "lookup_table_v2"], - paddle_stream=paddle_stream, - ), - vae_encoder=create_paddle_inference_runtime( - use_trt=args.use_trt, - dynamic_shape=vae_encoder_dynamic_shape, - use_fp16=args.use_fp16, - device_id=args.device_id, - paddle_stream=paddle_stream, - ), - vae_decoder=create_paddle_inference_runtime( - use_trt=args.use_trt, - dynamic_shape=vae_decoder_dynamic_shape, - use_fp16=args.use_fp16, - device_id=args.device_id, - paddle_stream=paddle_stream, - ), - unet=create_paddle_inference_runtime( - use_trt=args.use_trt, - dynamic_shape=unet_dynamic_shape, - use_fp16=args.use_fp16, - device_id=args.device_id, - paddle_stream=paddle_stream, - ), - ) - pipe = FastDeployStableDiffusionMegaPipeline.from_pretrained( - args.model_dir, - runtime_options=runtime_options, - ) - pipe.set_progress_bar_config(disable=True) - 
pipe.change_scheduler(args.scheduler) - parse_prompt_type = args.parse_prompt_type - width = args.width - height = args.height - hr_resize_width = args.hr_resize_width - hr_resize_height = args.hr_resize_height - - if args.infer_op == "all": - infer_op_list = ["zero_copy_infer", "raw"] - else: - infer_op_list = [args.infer_op] - if args.device == "kunlunxin_xpu" or args.backend == "paddle": - print("When device is kunlunxin_xpu or backend is paddle, we will use `raw` infer op.") - infer_op_list = ["raw"] - - for infer_op in infer_op_list: - infer_op_dict = { - "vae_encoder": infer_op, - "vae_decoder": infer_op, - "text_encoder": infer_op, - "unet": infer_op, - } - folder = f"infer_op_{infer_op}_fp16" if args.use_fp16 else f"infer_op_{infer_op}_fp32" - os.makedirs(folder, exist_ok=True) - - if args.task_name in ["text2img_control", "all"]: - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png" - ) - controlnet_cond = get_canny_image(init_image, args) - # text2img - prompt = "bird" - time_costs = [] - # warmup - pipe.text2img( - prompt, - num_inference_steps=10, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=1.0, - infer_op_dict=infer_op_dict, - ) - print("==> Test text2img_control performance.") - for step in trange(args.benchmark_steps): - start = time.time() - paddle.seed(seed) - images = pipe.text2img( - prompt, - num_inference_steps=args.inference_steps, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=1.0, - infer_op_dict=infer_op_dict, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." - ) - images[0].save(f"{folder}/text2img_control.png") - - if args.task_name in ["img2img_control", "all"]: - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" - init_image = load_image(img_url) - controlnet_cond = get_canny_image(init_image, args) - prompt = "A fantasy landscape, trending on artstation" - time_costs = [] - # warmup - pipe.img2img( - prompt, - image=init_image, - num_inference_steps=20, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=1.0, - infer_op_dict=infer_op_dict, - ) - print("==> Test img2img_control performance.") - for step in trange(args.benchmark_steps): - start = time.time() - paddle.seed(seed) - images = pipe.img2img( - prompt, - image=init_image, - num_inference_steps=args.inference_steps, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=1.0, - infer_op_dict=infer_op_dict, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." 
- ) - images[0].save(f"{folder}/img2img_control.png") - - if args.task_name in ["inpaint_legacy_control", "all"]: - img_url = ( - "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" - ) - mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" - init_image = load_image(img_url) - mask_image = load_image(mask_url) - controlnet_cond = get_canny_image(init_image, args) - prompt = "Face of a yellow cat, high resolution, sitting on a park bench" - time_costs = [] - - pipe.inpaint_legacy( - prompt, - image=init_image, - mask_image=mask_image, - num_inference_steps=20, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=1.0, - infer_op_dict=infer_op_dict, - ) - print("==> Test inpaint_legacy_control performance.") - for step in trange(args.benchmark_steps): - start = time.time() - paddle.seed(seed) - images = pipe.inpaint_legacy( - prompt, - image=init_image, - mask_image=mask_image, - num_inference_steps=args.inference_steps, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=1.0, - infer_op_dict=infer_op_dict, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." - ) - if args.task_name == "all": - task_name = "inpaint_legacy_control" - else: - task_name = args.task_name - images[0].save(f"{folder}/{task_name}.png") - - if args.task_name in ["hiresfix_control", "all"]: - hiresfix_pipe = DiffusionPipeline.from_pretrained( - args.model_dir, - vae_encoder=pipe.vae_encoder, - vae_decoder=pipe.vae_decoder, - text_encoder=pipe.text_encoder, - tokenizer=pipe.tokenizer, - unet=pipe.unet, - scheduler=pipe.scheduler, - safety_checker=pipe.safety_checker, - feature_extractor=pipe.feature_extractor, - requires_safety_checker=pipe.requires_safety_checker, - custom_pipeline="pipeline_fastdeploy_stable_diffusion_hires_fix", - ) - # custom_pipeline - # https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_hires_fix.py - hiresfix_pipe._progress_bar_config = pipe._progress_bar_config - # hiresfix_control - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png" - ) - controlnet_cond = get_canny_image(init_image, args) - # hiresfix_control - prompt = "a red bird" - time_costs = [] - # warmup - hiresfix_pipe( - prompt, - height=height, - width=width, - num_inference_steps=20, - hires_ratio=0.5, - hr_resize_width=hr_resize_width, - hr_resize_height=hr_resize_height, - enable_hr=True, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=1.0, - parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, - ) - print("==> Test hiresfix_control performance.") - for step in trange(args.benchmark_steps): - start = time.time() - paddle.seed(seed) - images = hiresfix_pipe( - prompt, - height=height, - width=width, - num_inference_steps=args.inference_steps, - hires_ratio=0.5, - hr_resize_width=hr_resize_width, - hr_resize_height=hr_resize_height, - enable_hr=True, - controlnet_cond=controlnet_cond, - 
controlnet_conditioning_scale=1.0, - parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." - ) - images[0].save(f"{folder}/hiresfix_control.png") - - -if __name__ == "__main__": - args = parse_arguments() - main(args) diff --git a/ppdiffusers/deploy/controlnet/infer_dygraph.py b/ppdiffusers/deploy/controlnet/infer_dygraph.py deleted file mode 100644 index 7ddb12b377fe..000000000000 --- a/ppdiffusers/deploy/controlnet/infer_dygraph.py +++ /dev/null @@ -1,360 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import time -import warnings - -import cv2 -import numpy as np -import paddle -from PIL import Image -from tqdm.auto import trange - -from paddlenlp.trainer.argparser import strtobool -from paddlenlp.utils.log import logger -from ppdiffusers import ControlNetModel, DiffusionPipeline -from ppdiffusers.utils import load_image - -logger.set_level("WARNING") - - -def get_canny_image(image, args): - if isinstance(image, Image.Image): - image = np.array(image) - image = cv2.Canny(image, args.low_threshold, args.high_threshold) - image = image[:, :, None] - image = np.concatenate([image, image, image], axis=2) - canny_image = Image.fromarray(image) - return canny_image - - -def parse_arguments(): - - parser = argparse.ArgumentParser() - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default="runwayml/stable-diffusion-v1-5", - help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).", - ) - parser.add_argument( - "--controlnet_pretrained_model_name_or_path", - type=str, - default="lllyasviel/sd-controlnet-canny", - help="Path to the `ppdiffusers` controlnet_pretrained_model_name_or_path checkpoint to convert (either a local directory or on the bos).", - ) - parser.add_argument("--inference_steps", type=int, default=50, help="The number of unet inference steps.") - parser.add_argument("--benchmark_steps", type=int, default=1, help="The number of performance benchmark steps.") - parser.add_argument( - "--task_name", - type=str, - default="text2img_control", - choices=[ - "text2img_control", - "img2img_control", - "inpaint_legacy_control", - "hiresfix_control", - "all", - ], - help="The task can be one of [text2img_control, img2img_control, inpaint_legacy_control, hiresfix_control, all]. ", - ) - parser.add_argument( - "--parse_prompt_type", - type=str, - default="lpw", - choices=[ - "raw", - "lpw", - ], - help="The parse_prompt_type can be one of [raw, lpw]. 
", - ) - parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") - parser.add_argument( - "--guess_mode", - type=strtobool, - default=False, - help="In this mode, the ControlNet encoder will try best to recognize the content of the input image even if you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended.", - ) - parser.add_argument( - "--attention_type", type=str, default="raw", choices=["raw", "cutlass", "flash", "all"], help="attention_type." - ) - parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu") - parser.add_argument( - "--scheduler", - type=str, - default="euler-ancestral", - choices=[ - "pndm", - "lms", - "euler", - "euler-ancestral", - "dpm-multi", - "dpm-single", - "unipc-multi", - "ddim", - "ddpm", - "deis-multi", - "heun", - "kdpm2-ancestral", - "kdpm2", - ], - help="The scheduler type of stable diffusion.", - ) - parser.add_argument("--height", type=int, default=512, help="Height of input image") - parser.add_argument("--width", type=int, default=512, help="Width of input image") - parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image") - parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image") - parser.add_argument("--low_threshold", type=int, default=100, help="The value of Canny low threshold.") - parser.add_argument("--high_threshold", type=int, default=200, help="The value of Canny high threshold.") - return parser.parse_args() - - -def main(args): - if args.device_id == -1: - paddle.set_device("cpu") - else: - paddle.set_device(f"gpu:{args.device_id}") - seed = 1024 - paddle_dtype = paddle.float16 if args.use_fp16 else paddle.float32 - controlnet = ControlNetModel.from_pretrained( - args.controlnet_pretrained_model_name_or_path, paddle_dtype=paddle_dtype - ) - pipe = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - controlnet=controlnet, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - paddle_dtype=paddle_dtype, - custom_pipeline="stable_diffusion_mega", - ) - pipe.set_progress_bar_config(disable=True) - pipe.change_scheduler(args.scheduler) - parse_prompt_type = args.parse_prompt_type - - if args.attention_type == "all": - args.attention_type = ["raw", "cutlass", "flash"] - else: - args.attention_type = [args.attention_type] - - for attention_type in args.attention_type: - if attention_type == "raw": - pipe.disable_xformers_memory_efficient_attention() - else: - try: - pipe.enable_xformers_memory_efficient_attention(attention_type) - except Exception as e: - if attention_type == "flash": - warnings.warn( - "Attention type flash is not supported on your GPU! We need to use 3060、3070、3080、3090、4060、4070、4080、4090、A30、A100 etc." 
- ) - continue - else: - raise ValueError(e) - guess_mode = args.guess_mode - width = args.width - height = args.height - hr_resize_width = args.hr_resize_width - hr_resize_height = args.hr_resize_height - folder = f"attn_{attention_type}_fp16" if args.use_fp16 else f"attn_{attention_type}_fp32" - os.makedirs(folder, exist_ok=True) - if args.task_name in ["text2img_control", "all"]: - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png" - ) - controlnet_cond = get_canny_image(init_image, args) - # text2img - prompt = "bird" - time_costs = [] - # warmup - pipe.text2img( - prompt, - num_inference_steps=10, - height=height, - width=width, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=1.0, - guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, - ) - print("==> Test text2img_control performance.") - for step in trange(args.benchmark_steps): - start = time.time() - paddle.seed(seed) - images = pipe.text2img( - prompt, - num_inference_steps=args.inference_steps, - height=height, - width=width, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=1.0, - guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." - ) - images[0].save(f"{folder}/text2img_control.png") - - if args.task_name in ["img2img_control", "all"]: - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" - init_image = load_image(img_url) - controlnet_cond = get_canny_image(init_image, args) - prompt = "A fantasy landscape, trending on artstation" - time_costs = [] - # warmup - pipe.img2img( - prompt, - image=init_image, - num_inference_steps=20, - height=height, - width=width, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=1.0, - guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, - ) - print("==> Test img2img_control performance.") - for step in trange(args.benchmark_steps): - start = time.time() - paddle.seed(seed) - images = pipe.img2img( - prompt, - image=init_image, - num_inference_steps=args.inference_steps, - height=height, - width=width, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=1.0, - guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." 
- ) - images[0].save(f"{folder}/img2img_control.png") - - if args.task_name in ["inpaint_legacy_control", "all"]: - img_url = ( - "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" - ) - mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" - init_image = load_image(img_url) - mask_image = load_image(mask_url) - controlnet_cond = get_canny_image(init_image, args) - prompt = "Face of a yellow cat, high resolution, sitting on a park bench" - time_costs = [] - task_name = "inpaint_legacy_control" - pipe.inpaint_legacy( - prompt, - image=init_image, - mask_image=mask_image, - num_inference_steps=20, - height=height, - width=width, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=1.0, - guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, - ) - print(f"==> Test {task_name} performance.") - for step in trange(args.benchmark_steps): - start = time.time() - paddle.seed(seed) - images = pipe.inpaint_legacy( - prompt, - image=init_image, - mask_image=mask_image, - num_inference_steps=args.inference_steps, - height=height, - width=width, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=1.0, - guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." - ) - images[0].save(f"{folder}/{task_name}.png") - - if args.task_name in ["hiresfix_control", "all"]: - # hiresfix_control - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png" - ) - controlnet_cond = get_canny_image(init_image, args) - # hiresfix_control - prompt = "a red bird" - time_costs = [] - # warmup - pipe.hires_fix( - prompt, - height=height, - width=width, - num_inference_steps=20, - hires_ratio=0.5, - hr_resize_width=hr_resize_width, - hr_resize_height=hr_resize_height, - enable_hr=True, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=1.0, - guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, - ) - print("==> Test hiresfix_control performance.") - for step in trange(args.benchmark_steps): - start = time.time() - paddle.seed(seed) - images = pipe.hires_fix( - prompt, - height=height, - width=width, - num_inference_steps=args.inference_steps, - hires_ratio=0.5, - hr_resize_width=hr_resize_width, - hr_resize_height=hr_resize_height, - enable_hr=True, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=1.0, - guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." 
- ) - images[0].save(f"{folder}/hiresfix_control.png") - - -if __name__ == "__main__": - args = parse_arguments() - main(args) diff --git a/ppdiffusers/deploy/controlnet/infer_dygraph_toch.py b/ppdiffusers/deploy/controlnet/infer_dygraph_toch.py deleted file mode 100644 index 20dc65d42741..000000000000 --- a/ppdiffusers/deploy/controlnet/infer_dygraph_toch.py +++ /dev/null @@ -1,419 +0,0 @@ -# Copyright (c) 2023 torchtorch Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import time - -import torch - -torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention -delattr(torch.nn.functional, "scaled_dot_product_attention") - -import cv2 -import numpy as np -from diffusers import ( - ControlNetModel, - DDIMScheduler, - DDPMScheduler, - DEISMultistepScheduler, - DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - HeunDiscreteScheduler, - KDPM2AncestralDiscreteScheduler, - KDPM2DiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionControlNetImg2ImgPipeline, - StableDiffusionControlNetInpaintPipeline, - StableDiffusionControlNetPipeline, - UniPCMultistepScheduler, -) -from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0 -from diffusers.utils import load_image -from PIL import Image -from tqdm.auto import trange - - -def get_canny_image(image, args): - if isinstance(image, Image.Image): - image = np.array(image) - image = cv2.Canny(image, args.low_threshold, args.high_threshold) - image = image[:, :, None] - image = np.concatenate([image, image, image], axis=2) - canny_image = Image.fromarray(image) - return canny_image - - -def strtobool(v): - if isinstance(v, bool): - return v - if v.lower() in ("yes", "true", "t", "y", "1"): - return True - elif v.lower() in ("no", "false", "f", "n", "0"): - return False - else: - raise ValueError( - f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)." 
- ) - - -def change_scheduler(self, scheduler_type="ddim"): - self.orginal_scheduler_config = self.scheduler.config - scheduler_type = scheduler_type.lower() - if scheduler_type == "pndm": - scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) - elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "dpm-multi": - scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "dpm-single": - scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "kdpm2-ancestral": - scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "kdpm2": - scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "unipc-multi": - scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "ddim": - scheduler = DDIMScheduler.from_config( - self.orginal_scheduler_config, - steps_offset=1, - clip_sample=False, - set_alpha_to_one=False, - ) - elif scheduler_type == "ddpm": - scheduler = DDPMScheduler.from_config( - self.orginal_scheduler_config, - ) - elif scheduler_type == "deis-multi": - scheduler = DEISMultistepScheduler.from_config( - self.orginal_scheduler_config, - ) - else: - raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - return scheduler - - -def parse_arguments(): - - parser = argparse.ArgumentParser() - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default="runwayml/stable-diffusion-v1-5", - help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).", - ) - parser.add_argument( - "--controlnet_pretrained_model_name_or_path", - type=str, - default="lllyasviel/sd-controlnet-canny", - help="Path to the `ppdiffusers` controlnet_pretrained_model_name_or_path checkpoint to convert (either a local directory or on the bos).", - ) - parser.add_argument("--inference_steps", type=int, default=50, help="The number of unet inference steps.") - parser.add_argument("--benchmark_steps", type=int, default=10, help="The number of performance benchmark steps.") - parser.add_argument( - "--task_name", - type=str, - default="all", - choices=[ - "text2img_control", - "img2img_control", - "inpaint_legacy_control", - "all", - ], - help="The task can be one of [text2img_control, img2img_control, inpaint_legacy_control, all]. ", - ) - parser.add_argument( - "--parse_prompt_type", - type=str, - default="raw", - choices=[ - "raw", - "lpw", - ], - help="The parse_prompt_type can be one of [raw, lpw]. 
", - ) - parser.add_argument("--channels_last", type=strtobool, default=False, help="Wheter to use channels_last") - parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") - parser.add_argument("--tf32", type=strtobool, default=True, help="tf32") - parser.add_argument("--compile", type=strtobool, default=False, help="compile") - parser.add_argument( - "--attention_type", - type=str, - default="sdp", - choices=[ - "raw", - "sdp", - ], - help="attention_type.", - ) - parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu") - parser.add_argument( - "--scheduler", - type=str, - default="euler-ancestral", - choices=[ - "pndm", - "lms", - "euler", - "euler-ancestral", - "dpm-multi", - "dpm-single", - "unipc-multi", - "ddim", - "ddpm", - "deis-multi", - "heun", - "kdpm2-ancestral", - "kdpm2", - ], - help="The scheduler type of stable diffusion.", - ) - parser.add_argument("--height", type=int, default=512, help="Height of input image") - parser.add_argument("--width", type=int, default=512, help="Width of input image") - parser.add_argument("--low_threshold", type=int, default=100, help="The value of Canny low threshold.") - parser.add_argument("--high_threshold", type=int, default=200, help="The value of Canny high threshold.") - return parser.parse_args() - - -def attn_processors(self): - processors = {} - - def fn_recursive_add_processors(name: str, module, processors): - if hasattr(module, "set_processor"): - processors[f"{name}.processor"] = module.processor - - for sub_name, child in module.named_children(): - fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) - - return processors - - for name, module in self.named_children(): - fn_recursive_add_processors(name, module, processors) - - return processors - - -def set_attn_processor(self, processor): - count = len(attn_processors(self).keys()) - - if isinstance(processor, dict) and len(processor) != count: - raise ValueError( - f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" - f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
- ) - - def fn_recursive_attn_processor(name: str, module, processor): - if hasattr(module, "set_processor"): - if not isinstance(processor, dict): - module.set_processor(processor) - else: - module.set_processor(processor.pop(f"{name}.processor")) - - for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) - - for name, module in self.named_children(): - fn_recursive_attn_processor(name, module, processor) - - -def main(args): - if args.tf32: - torch.backends.cuda.matmul.allow_tf32 = True - else: - torch.backends.cuda.matmul.allow_tf32 = False - - seed = 1024 - torch_dtype = torch.float16 if args.use_fp16 else torch.float32 - controlnet = ControlNetModel.from_pretrained( - args.controlnet_pretrained_model_name_or_path, torch_dtype=torch_dtype - ) - pipe = StableDiffusionControlNetPipeline.from_pretrained( - args.pretrained_model_name_or_path, - controlnet=controlnet, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - torch_dtype=torch_dtype, - ) - scheduler = change_scheduler(pipe, args.scheduler) - pipe.scheduler = scheduler - if args.device_id >= 0: - pipe.to(f"cuda:{args.device_id}") - - if args.attention_type == "all": - args.attention_type = ["raw", "sdp"] - else: - args.attention_type = [args.attention_type] - - for attention_type in args.attention_type: - attn_prrocessor_cls = AttnProcessor if attention_type == "raw" else AttnProcessor2_0 - if attention_type == "sdp": - torch.nn.functional.scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention_ - set_attn_processor(pipe.unet, attn_prrocessor_cls()) - set_attn_processor(pipe.vae, attn_prrocessor_cls()) - set_attn_processor(pipe.controlnet, attn_prrocessor_cls()) - - if args.channels_last: - pipe.unet.to(memory_format=torch.channels_last) - pipe.controlnet.to(memory_format=torch.channels_last) - - if args.compile: - print("Run torch compile") - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) - - width = args.width - height = args.height - pipe.set_progress_bar_config(disable=True) - - folder = f"torch_attn_{attention_type}_fp16" if args.use_fp16 else f"torch_attn_{attention_type}_fp32" - os.makedirs(folder, exist_ok=True) - if args.task_name in ["text2img_control", "all"]: - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png" - ) - controlnet_cond = get_canny_image(init_image, args).resize((width, height)) - # text2img - prompt = "bird" - time_costs = [] - # warmup - pipe( - prompt, - num_inference_steps=10, - height=height, - width=width, - image=controlnet_cond, - controlnet_conditioning_scale=1.0, - ) - print("==> Test text2img_control performance.") - for step in trange(args.benchmark_steps): - start = time.time() - torch.cuda.manual_seed(seed) - images = pipe( - prompt, - num_inference_steps=args.inference_steps, - height=height, - width=width, - image=controlnet_cond, - controlnet_conditioning_scale=1.0, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." 
- ) - images[0].save(f"{folder}/text2img_control.png") - - if args.task_name in ["img2img_control", "all"]: - pipe_img2img = StableDiffusionControlNetImg2ImgPipeline(**pipe.components) - pipe_img2img.set_progress_bar_config(disable=True) - img_url = "sketch-mountains-input.png" - init_image = load_image(img_url).resize((width, height)) - controlnet_cond = get_canny_image(init_image, args).resize((width, height)) - prompt = "A fantasy landscape, trending on artstation" - time_costs = [] - # warmup - pipe_img2img( - prompt, - image=init_image, - num_inference_steps=20, - height=height, - width=width, - control_image=controlnet_cond, - controlnet_conditioning_scale=1.0, - ) - print("==> Test img2img_control performance.") - for step in trange(args.benchmark_steps): - start = time.time() - torch.cuda.manual_seed(seed) - images = pipe_img2img( - prompt, - image=init_image, - num_inference_steps=args.inference_steps, - height=height, - width=width, - control_image=controlnet_cond, - controlnet_conditioning_scale=1.0, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." - ) - images[0].save(f"{folder}/img2img_control.png") - - if args.task_name in ["inpaint_legacy_control", "all"]: - pipe_inpaint = StableDiffusionControlNetInpaintPipeline(**pipe.components) - pipe_inpaint.set_progress_bar_config(disable=True) - img_url = ( - "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" - ) - mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" - init_image = load_image(img_url).resize((width, height)) - mask_image = load_image(mask_url).resize((width, height)) - controlnet_cond = get_canny_image(init_image, args).resize((width, height)) - prompt = "Face of a yellow cat, high resolution, sitting on a park bench" - time_costs = [] - task_name = "inpaint_legacy_control" - pipe_inpaint( - prompt, - image=init_image, - mask_image=mask_image, - num_inference_steps=20, - height=height, - width=width, - control_image=controlnet_cond, - controlnet_conditioning_scale=1.0, - ) - print(f"==> Test {task_name} performance.") - for step in trange(args.benchmark_steps): - start = time.time() - torch.cuda.manual_seed(seed) - images = pipe_inpaint( - prompt, - image=init_image, - mask_image=mask_image, - num_inference_steps=args.inference_steps, - height=height, - width=width, - control_image=controlnet_cond, - controlnet_conditioning_scale=1.0, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." 
- ) - images[0].save(f"{folder}/{task_name}.png") - - -if __name__ == "__main__": - args = parse_arguments() - main(args) diff --git a/ppdiffusers/deploy/export.md b/ppdiffusers/deploy/export.md deleted file mode 100644 index 9103849b9147..000000000000 --- a/ppdiffusers/deploy/export.md +++ /dev/null @@ -1,65 +0,0 @@ -# Diffusion 模型导出教程 - - -[PPDiffusers](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers) 是一款支持跨模态(如图像与语音)训练和推理的扩散模型(Diffusion Model)工具箱,其借鉴了🤗 Huggingface 团队的 [Diffusers](https://github.com/huggingface/diffusers) 的优秀设计,并且依托 [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) 框架和 [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP) 自然语言处理库。下面将介绍如何将 PPDiffusers 提供的预训练模型进行模型导出。 - -### 模型导出 - -___注意:模型导出过程中,需要下载 StableDiffusion 模型。为了使用该模型与权重,你必须接受该模型所要求的 License,请访问 HuggingFace 的[model card](https://huggingface.co/runwayml/stable-diffusion-v1-5), 仔细阅读里面的 License,然后签署该协议。___ - -___Tips: Stable Diffusion 是基于以下的 License: The CreativeML OpenRAIL M license is an Open RAIL M license, adapted from the work that BigScience and the RAIL Initiative are jointly carrying in the area of responsible AI licensing. See also the article about the BLOOM Open RAIL license on which this license is based.___ - -可执行以下命令行完成模型导出。 - -```shell -python export_model.py --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 --output_path stable-diffusion-v1-5 --height=512 --width=512 -``` -注: 上述指令导出固定尺寸的模型,固定尺寸的导出模型有利于优化模型推理性能,但会牺牲一定灵活性。若要导出支持多种推理尺寸的模型,可取消参数--height和--width的设置。 - -输出的模型目录结构如下: - -```shell -stable-diffusion-v1-5/ -├── model_index.json -├── scheduler -│ └── scheduler_config.json -├── tokenizer -│ ├── tokenizer_config.json -│ ├── merges.txt -│ ├── vocab.json -│ └── special_tokens_map.json -├── text_encoder -│ ├── inference.pdiparams -│ ├── inference.pdiparams.info -│ └── inference.pdmodel -├── unet -│ ├── inference.pdiparams -│ ├── inference.pdiparams.info -│ └── inference.pdmodel -├── vae_decoder -│ ├── inference.pdiparams -│ ├── inference.pdiparams.info -│ └── inference.pdmodel -└── vae_encoder - ├── inference.pdiparams - ├── inference.pdiparams.info - └── inference.pdmodel -``` - -#### Inpaint 任务模型导出 - -除了支持常规 StableDiffusion 文生图、图生图任务的模型导出以外,还支持Inpaint任务模型 (注意:这个不是 legacy 版本的 inpaint) 的导出、如果需要导出 inpaint 模型,可以执行以下命令: - -```shell -python export_model.py --pretrained_model_name_or_path runwayml/stable-diffusion-inpainting --output_path stable-diffusion-v1-5-inpainting -``` - -#### 参数说明 - -`export_model.py` 各命令行参数的说明。 - -| 参数 |参数说明 | -|----------|--------------| -| --pretrained_model_name_or_path | ppdiffuers提供的diffusion预训练模型。默认为:"CompVis/stable-diffusion-v1-4"。更多 StableDiffusion 预训练模型可参考 [ppdiffusers 模型列表](../README.md#ppdiffusers模型支持的权重)。| -| --output_path | 导出的模型目录。 | -| --sample | vae encoder 的输出是否调整为 sample 模式,注意:sample模式会引入随机因素,默认是 False。| diff --git a/ppdiffusers/deploy/export_model.py b/ppdiffusers/deploy/export_model.py deleted file mode 100644 index cb4731c68bca..000000000000 --- a/ppdiffusers/deploy/export_model.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from pathlib import Path -from types import MethodType - -import paddle - -from ppdiffusers import ( - FastDeployRuntimeModel, - FastDeployStableDiffusionInpaintPipeline, - FastDeployStableDiffusionMegaPipeline, - StableDiffusionPipeline, - UNet2DConditionModel, -) - - -def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( - model_path: str, - output_path: str, - sample: bool = False, - height: int = None, - width: int = None, -): - # specify unet model with unet pre_temb_act opt enabled. - unet_model = UNet2DConditionModel.from_pretrained(model_path, resnet_pre_temb_non_linearity=True, subfolder="unet") - pipeline = StableDiffusionPipeline.from_pretrained( - model_path, unet=unet_model, safety_checker=None, feature_extractor=None - ) - # make sure we disable xformers - pipeline.disable_xformers_memory_efficient_attention() - output_path = Path(output_path) - # calculate latent's H and W - latent_height = height // 8 if height is not None else None - latent_width = width // 8 if width is not None else None - # get arguments - cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280 - unet_channels = pipeline.unet.config.in_channels # 4 or 9 - vae_in_channels = pipeline.vae.config.in_channels # 3 - vae_latent_channels = pipeline.vae.config.latent_channels # 4 - print( - f"cross_attention_dim: {cross_attention_dim}\n", - f"unet_in_channels: {unet_channels}\n", - f"vae_encoder_in_channels: {vae_in_channels}\n", - f"vae_decoder_latent_channels: {vae_latent_channels}", - ) - # 1. Convert text_encoder - text_encoder = paddle.jit.to_static( - pipeline.text_encoder, - input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids - ) - save_path = os.path.join(args.output_path, "text_encoder", "inference") - paddle.jit.save(text_encoder, save_path) - print(f"Save text_encoder model in {save_path} successfully.") - del pipeline.text_encoder - - # 2. Convert unet - unet = paddle.jit.to_static( - pipeline.unet, - input_spec=[ - paddle.static.InputSpec( - shape=[None, unet_channels, latent_height, latent_width], dtype="float32", name="sample" - ), # sample - paddle.static.InputSpec(shape=[1], dtype="float32", name="timestep"), # timestep - paddle.static.InputSpec( - shape=[None, None, cross_attention_dim], dtype="float32", name="encoder_hidden_states" - ), # encoder_hidden_states - ], - ) - save_path = os.path.join(args.output_path, "unet", "inference") - paddle.jit.save(unet, save_path) - print(f"Save unet model in {save_path} successfully.") - del pipeline.unet - - def forward_vae_encoder_mode(self, z): - return self.encode(z, True).latent_dist.mode() - - def forward_vae_encoder_sample(self, z): - return self.encode(z, True).latent_dist.sample() - - # 3. Convert vae encoder - vae_encoder = pipeline.vae - if sample: - vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder) - else: - vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder) - - vae_encoder = paddle.jit.to_static( - vae_encoder, - input_spec=[ - paddle.static.InputSpec( - shape=[None, vae_in_channels, height, width], - dtype="float32", - name="sample", # N, C, H, W - ), # latent - ], - ) - # Save vae_encoder in static graph model. 
- save_path = os.path.join(args.output_path, "vae_encoder", "inference") - paddle.jit.save(vae_encoder, save_path) - print(f"Save vae_encoder model in {save_path} successfully.") - - # 4. Convert vae encoder - vae_decoder = pipeline.vae - - def forward_vae_decoder(self, z): - return self.decode(z, True).sample - - vae_decoder.forward = MethodType(forward_vae_decoder, vae_decoder) - vae_decoder = paddle.jit.to_static( - vae_decoder, - input_spec=[ - paddle.static.InputSpec( - shape=[None, vae_latent_channels, latent_height, latent_width], dtype="float32", name="latent_sample" - ), # latent_sample - ], - ) - # Save vae_decoder in static graph model. - save_path = os.path.join(args.output_path, "vae_decoder", "inference") - paddle.jit.save(vae_decoder, save_path) - print(f"Save vae_decoder model in {save_path} successfully.") - del pipeline.vae - - if "inpainting" in model_path: - fd_pipe_cls = FastDeployStableDiffusionInpaintPipeline - else: - fd_pipe_cls = FastDeployStableDiffusionMegaPipeline - - fastdeploy_pipeline = fd_pipe_cls( - vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), - vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_decoder"), - text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder"), - unet=FastDeployRuntimeModel.from_pretrained(output_path / "unet"), - tokenizer=pipeline.tokenizer, - scheduler=pipeline.scheduler, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - fastdeploy_pipeline.save_pretrained(output_path) - print("FastDeploy pipeline saved to", output_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - required=True, - help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).", - ) - parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.") - parser.add_argument( - "--sample", action="store_true", default=False, help="Export the vae encoder in mode or sample" - ) - parser.add_argument("--height", type=int, default=None, help="The height of output images. Default: None") - parser.add_argument("--width", type=int, default=None, help="The width of output images. Default: None") - args = parser.parse_args() - - convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( - args.pretrained_model_name_or_path, args.output_path, args.sample, args.height, args.width - ) diff --git a/ppdiffusers/deploy/infer.py b/ppdiffusers/deploy/infer.py deleted file mode 100644 index a031376914ed..000000000000 --- a/ppdiffusers/deploy/infer.py +++ /dev/null @@ -1,708 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-import argparse
-import os
-import time
-
-# isort: split
-import paddle
-
-# isort: split
-import fastdeploy as fd
-import numpy as np
-from tqdm.auto import trange
-
-from paddlenlp.trainer.argparser import strtobool
-from ppdiffusers import DiffusionPipeline, FastDeployStableDiffusionMegaPipeline
-from ppdiffusers.utils import load_image
-
-
-def parse_arguments():
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--model_dir",
-        default="runwayml/stable-diffusion-v1-5@fastdeploy",
-        help="The model directory of diffusion_model.",
-    )
-    parser.add_argument("--inference_steps", type=int, default=50, help="The number of unet inference steps.")
-    parser.add_argument("--benchmark_steps", type=int, default=1, help="The number of performance benchmark steps.")
-    parser.add_argument(
-        "--backend",
-        type=str,
-        default="paddle_tensorrt",
-        # Note(zhoushunjie): Will support 'tensorrt' soon.
-        choices=["onnx_runtime", "paddle", "paddlelite", "paddle_tensorrt"],
-        help="The inference runtime backend of unet model and text encoder model.",
-    )
-    parser.add_argument(
-        "--device",
-        type=str,
-        default="gpu",
-        # Note(shentanyue): Will support more devices.
-        choices=[
-            "cpu",
-            "gpu",
-            "huawei_ascend_npu",
-            "kunlunxin_xpu",
-        ],
-        help="The inference runtime device of models.",
-    )
-    parser.add_argument(
-        "--task_name",
-        type=str,
-        default="text2img",
-        choices=[
-            "text2img",
-            "img2img",
-            "inpaint",
-            "inpaint_legacy",
-            "cycle_diffusion",
-            "hiresfix",
-            "mixture_tiling",
-            "all",
-        ],
-        help="The task can be one of [text2img, img2img, inpaint, inpaint_legacy, cycle_diffusion, hiresfix, mixture_tiling, all]. ",
-    )
-    parser.add_argument(
-        "--parse_prompt_type",
-        type=str,
-        default="lpw",
-        choices=[
-            "raw",
-            "lpw",
-        ],
-        help="The parse_prompt_type can be one of [raw, lpw]. ",
-    )
-    parser.add_argument("--use_fp16", type=strtobool, default=True, help="Whether to use FP16 mode")
-    parser.add_argument("--use_bf16", type=strtobool, default=False, help="Whether to use BF16 mode")
-    parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. 
-1 means use cpu") - parser.add_argument( - "--scheduler", - type=str, - default="preconfig-euler-ancestral", - choices=[ - "pndm", - "lms", - "euler", - "euler-ancestral", - "preconfig-euler-ancestral", - "dpm-multi", - "dpm-single", - "unipc-multi", - "ddim", - "ddpm", - "deis-multi", - "heun", - "kdpm2-ancestral", - "kdpm2", - ], - help="The scheduler type of stable diffusion.", - ) - parser.add_argument( - "--infer_op", - type=str, - default="zero_copy_infer", - choices=[ - "zero_copy_infer", - "raw", - "all", - ], - help="The type of infer op.", - ) - parser.add_argument("--height", type=int, default=512, help="Height of input image") - parser.add_argument("--width", type=int, default=512, help="Width of input image") - parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image") - parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image") - parser.add_argument("--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?") - - return parser.parse_args() - - -def create_ort_runtime(device_id=0): - option = fd.RuntimeOption() - option.use_ort_backend() - if device_id == -1: - option.use_cpu() - else: - option.use_gpu(device_id) - return option - - -def create_paddle_inference_runtime( - use_trt=False, - dynamic_shape=None, - use_fp16=False, - use_bf16=False, - device_id=0, - disable_paddle_trt_ops=[], - disable_paddle_pass=[], - paddle_stream=None, - workspace=None, -): - assert not use_fp16 or not use_bf16, "use_fp16 and use_bf16 are mutually exclusive" - option = fd.RuntimeOption() - option.use_paddle_backend() - if device_id == -1: - option.use_cpu() - else: - option.use_gpu(device_id) - if paddle_stream is not None and use_trt: - option.set_external_raw_stream(paddle_stream) - for pass_name in disable_paddle_pass: - option.paddle_infer_option.delete_pass(pass_name) - if use_bf16: - option.paddle_infer_option.inference_precision = "bfloat16" - if use_trt: - option.paddle_infer_option.disable_trt_ops(disable_paddle_trt_ops) - option.paddle_infer_option.enable_trt = True - if workspace is not None: - option.set_trt_max_workspace_size(workspace) - if use_fp16: - option.trt_option.enable_fp16 = True - else: - # Note(zhoushunjie): These four passes don't support fp32 now. - # Remove this line of code in future. 
- only_fp16_passes = [ - "trt_cross_multihead_matmul_fuse_pass", - "trt_flash_multihead_matmul_fuse_pass", - "preln_elementwise_groupnorm_act_pass", - "elementwise_groupnorm_act_pass", - ] - for curr_pass in only_fp16_passes: - option.paddle_infer_option.delete_pass(curr_pass) - - # Need to enable collect shape - if dynamic_shape is not None: - option.paddle_infer_option.collect_trt_shape = True - for key, shape_dict in dynamic_shape.items(): - option.trt_option.set_shape( - key, shape_dict["min_shape"], shape_dict.get("opt_shape", None), shape_dict.get("max_shape", None) - ) - return option - - -def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): - option = fd.RuntimeOption() - option.use_paddle_lite_backend() - if device == "huawei_ascend_npu": - option.use_ascend() - option.set_lite_device_names(["huawei_ascend_npu"]) - option.set_lite_context_properties( - "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={};HUAWEI_ASCEND_NPU_PRECISION_MODE=allow_mix_precision".format( - device_id - ) - ) - elif device == "kunlunxin_xpu": - # TODO(shentanyue): Add kunlunxin_xpu code - # https://github.com/PaddlePaddle/FastDeploy/blob/4c3e7030e151528d304619901c794481bb2f6037/examples/multimodal/stable_diffusion/infer.py#L178-L195 - option.use_kunlunxin( - device_id, - l3_workspace_size=(64 * 1024 * 1024 - 4 * 1024), - locked=False, - autotune=False, - autotune_file="", - precision="int16", - adaptive_seqlen=True, - enable_multi_stream=True, - ) - if use_fp16: - option.enable_lite_fp16() - else: - pass - return option - - -def create_trt_runtime(workspace=(1 << 31), dynamic_shape=None, use_fp16=False, device_id=0): - option = fd.RuntimeOption() - option.use_trt_backend() - option.use_gpu(device_id) - if use_fp16: - option.enable_trt_fp16() - if workspace is not None: - option.set_trt_max_workspace_size(workspace) - if dynamic_shape is not None: - for key, shape_dict in dynamic_shape.items(): - option.set_trt_input_shape( - key, - min_shape=shape_dict["min_shape"], - opt_shape=shape_dict.get("opt_shape", None), - max_shape=shape_dict.get("max_shape", None), - ) - return option - - -def main(args): - if args.device_id == -1: - paddle.set_device("cpu") - paddle_stream = None - else: - paddle.set_device(f"gpu:{args.device_id}") - paddle_stream = paddle.device.cuda.current_stream(args.device_id).cuda_stream - - seed = 1024 - vae_in_channels = 4 - text_encoder_max_length = 77 - unet_max_length = text_encoder_max_length * 3 # lpw support max_length is 77x3 - min_image_size = 512 - max_image_size = 768 - max_image_size = max(min_image_size, max_image_size) - hidden_states = 1024 if args.is_sd2_0 else 768 - unet_in_channels = 9 if args.task_name == "inpaint" else 4 - - if args.task_name == "cycle_diffusion": - bs = 4 - min_image_size = max_image_size = 512 - else: - bs = 2 - - text_encoder_dynamic_shape = { - "input_ids": { - "min_shape": [1, text_encoder_max_length], - "max_shape": [1, text_encoder_max_length], - "opt_shape": [1, text_encoder_max_length], - } - } - - vae_encoder_dynamic_shape = { - "sample": { - "min_shape": [1, 3, min_image_size, min_image_size], - "max_shape": [1, 3, max_image_size, max_image_size], - "opt_shape": [1, 3, min_image_size, min_image_size], - } - } - - vae_decoder_dynamic_shape = { - "latent_sample": { - "min_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], - "max_shape": [1, vae_in_channels, max_image_size // 8, max_image_size // 8], - "opt_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], - } - } - - unet_dynamic_shape = { 
- "sample": { - "min_shape": [1, unet_in_channels, min_image_size // 8, min_image_size // 8], - "max_shape": [bs, unet_in_channels, max_image_size // 8, max_image_size // 8], - "opt_shape": [2, unet_in_channels, min_image_size // 8, min_image_size // 8], - }, - "timestep": { - "min_shape": [1], - "max_shape": [1], - "opt_shape": [1], - }, - "encoder_hidden_states": { - "min_shape": [1, text_encoder_max_length, hidden_states], - "max_shape": [bs, unet_max_length, hidden_states], - "opt_shape": [2, text_encoder_max_length, hidden_states], - }, - } - # 4. Init runtime - if args.backend == "onnx_runtime": - runtime_options = dict( - text_encoder=create_ort_runtime(device_id=args.device_id), - vae_encoder=create_ort_runtime(device_id=args.device_id), - vae_decoder=create_ort_runtime(device_id=args.device_id), - unet=create_ort_runtime(device_id=args.device_id), - ) - elif args.backend == "paddlelite": - runtime_options = dict( - text_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), - vae_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), - vae_decoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), - unet=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=args.use_fp16), - ) - elif args.backend == "tensorrt": - runtime_options = dict( - text_encoder=create_trt_runtime( - dynamic_shape=text_encoder_dynamic_shape, use_fp16=args.use_fp16, device_id=args.device_id - ), - vae_encoder=create_trt_runtime( - dynamic_shape=vae_encoder_dynamic_shape, use_fp16=args.use_fp16, device_id=args.device_id - ), - vae_decoder=create_trt_runtime( - dynamic_shape=vae_decoder_dynamic_shape, use_fp16=args.use_fp16, device_id=args.device_id - ), - unet=create_trt_runtime( - dynamic_shape=unet_dynamic_shape, use_fp16=args.use_fp16, device_id=args.device_id - ), - ) - elif args.backend == "paddle" or args.backend == "paddle_tensorrt": - args.use_trt = args.backend == "paddle_tensorrt" - runtime_options = dict( - text_encoder=create_paddle_inference_runtime( - use_trt=args.use_trt, - dynamic_shape=text_encoder_dynamic_shape, - use_fp16=args.use_fp16, - use_bf16=args.use_bf16, - device_id=args.device_id, - disable_paddle_trt_ops=["arg_max", "range", "lookup_table_v2"], - paddle_stream=paddle_stream, - ), - vae_encoder=create_paddle_inference_runtime( - use_trt=args.use_trt, - dynamic_shape=vae_encoder_dynamic_shape, - use_fp16=args.use_fp16, - use_bf16=args.use_bf16, - device_id=args.device_id, - paddle_stream=paddle_stream, - ), - vae_decoder=create_paddle_inference_runtime( - use_trt=args.use_trt, - dynamic_shape=vae_decoder_dynamic_shape, - use_fp16=args.use_fp16, - use_bf16=args.use_bf16, - device_id=args.device_id, - paddle_stream=paddle_stream, - ), - unet=create_paddle_inference_runtime( - use_trt=args.use_trt, - dynamic_shape=unet_dynamic_shape, - use_fp16=args.use_fp16, - use_bf16=args.use_bf16, - device_id=args.device_id, - paddle_stream=paddle_stream, - ), - ) - pipe = FastDeployStableDiffusionMegaPipeline.from_pretrained( - args.model_dir, - runtime_options=runtime_options, - ) - pipe.set_progress_bar_config(disable=True) - pipe.change_scheduler(args.scheduler) - parse_prompt_type = args.parse_prompt_type - width = args.width - height = args.height - hr_resize_width = args.hr_resize_width - hr_resize_height = args.hr_resize_height - - if args.infer_op == "all": - infer_op_list = ["zero_copy_infer", "raw"] - else: - infer_op_list = 
[args.infer_op] - if args.device == "kunlunxin_xpu" or args.backend == "paddle": - print("When device is kunlunxin_xpu or backend is paddle, we will use `raw` infer op.") - infer_op_list = ["raw"] - - for infer_op in infer_op_list: - infer_op_dict = { - "vae_encoder": infer_op, - "vae_decoder": infer_op, - "text_encoder": infer_op, - "unet": infer_op, - } - folder = f"infer_op_{infer_op}_fp16" if args.use_fp16 else f"infer_op_{infer_op}_fp32" - os.makedirs(folder, exist_ok=True) - if args.task_name in ["text2img", "all"]: - # text2img - prompt = "a photo of an astronaut riding a horse on mars" - time_costs = [] - # warmup - pipe.text2img( - prompt, - num_inference_steps=10, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, - ) - print("==> Test text2img performance.") - for step in trange(args.benchmark_steps): - start = time.time() - paddle.seed(seed) - images = pipe.text2img( - prompt, - num_inference_steps=args.inference_steps, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." - ) - images[0].save(f"{folder}/text2img.png") - - if args.task_name in ["img2img", "all"]: - # img2img - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" - init_image = load_image(img_url) - prompt = "A fantasy landscape, trending on artstation" - time_costs = [] - # warmup - pipe.img2img( - prompt, - image=init_image, - num_inference_steps=20, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, - ) - print("==> Test img2img performance.") - for step in trange(args.benchmark_steps): - start = time.time() - paddle.seed(seed) - images = pipe.img2img( - prompt, - image=init_image, - num_inference_steps=args.inference_steps, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." 
- ) - images[0].save(f"{folder}/img2img.png") - - if args.task_name in ["inpaint", "inpaint_legacy", "all"]: - img_url = ( - "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" - ) - mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" - init_image = load_image(img_url) - mask_image = load_image(mask_url) - prompt = "Face of a yellow cat, high resolution, sitting on a park bench" - time_costs = [] - # warmup - if args.task_name in ["inpaint_legacy", "all"]: - call_fn = pipe.inpaint_legacy - task_name = "inpaint_legacy" - else: - call_fn = pipe.inpaint - task_name = "inpaint" - call_fn( - prompt, - image=init_image, - mask_image=mask_image, - num_inference_steps=20, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, - ) - print(f"==> Test {task_name} performance.") - for step in trange(args.benchmark_steps): - start = time.time() - paddle.seed(seed) - images = call_fn( - prompt, - image=init_image, - mask_image=mask_image, - num_inference_steps=args.inference_steps, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." - ) - - images[0].save(f"{folder}/{task_name}.png") - - if args.task_name in ["hiresfix", "all"]: - hiresfix_pipe = DiffusionPipeline.from_pretrained( - args.model_dir, - vae_encoder=pipe.vae_encoder, - vae_decoder=pipe.vae_decoder, - text_encoder=pipe.text_encoder, - tokenizer=pipe.tokenizer, - unet=pipe.unet, - scheduler=pipe.scheduler, - safety_checker=pipe.safety_checker, - feature_extractor=pipe.feature_extractor, - requires_safety_checker=pipe.requires_safety_checker, - custom_pipeline="pipeline_fastdeploy_stable_diffusion_hires_fix", - ) - # custom_pipeline - # https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_hires_fix.py - hiresfix_pipe._progress_bar_config = pipe._progress_bar_config - # hiresfix - prompt = "a photo of an astronaut riding a horse on mars" - time_costs = [] - # warmup - hiresfix_pipe( - prompt, - height=height, - width=width, - num_inference_steps=20, - hires_ratio=0.5, - hr_resize_width=hr_resize_width, - hr_resize_height=hr_resize_height, - enable_hr=True, - parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, - ) - print("==> Test hiresfix performance.") - for step in trange(args.benchmark_steps): - start = time.time() - paddle.seed(seed) - images = hiresfix_pipe( - prompt, - height=height, - width=width, - num_inference_steps=args.inference_steps, - hires_ratio=0.5, - hr_resize_width=hr_resize_width, - hr_resize_height=hr_resize_height, - enable_hr=True, - infer_op_dict=infer_op_dict, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." 
- ) - images[0].save(f"{folder}/hiresfix.png") - - if args.task_name in ["cycle_diffusion"]: - pipe.change_scheduler("ddim") - image_url = ( - "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/ride_on_horse.png" - ) - init_image = load_image(image_url) - source_prompt = "An astronaut riding a horse" - prompt = "An astronaut riding an elephant" - time_costs = [] - # warmup - pipe.cycle_diffusion( - prompt=prompt, - source_prompt=source_prompt, - image=init_image, - num_inference_steps=10, - eta=0.1, - strength=0.8, - guidance_scale=2, - source_guidance_scale=1, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, - ).images[0] - print("==> Test cycle diffusion performance.") - for step in trange(args.benchmark_steps): - start = time.time() - paddle.seed(seed) - images = pipe.cycle_diffusion( - prompt=prompt, - source_prompt=source_prompt, - image=init_image, - num_inference_steps=args.inference_steps, - eta=0.1, - strength=0.8, - guidance_scale=2, - source_guidance_scale=1, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." - ) - images[0].save(f"{folder}/cycle_diffusion.png") - - if args.task_name in ["mixture_tiling", "all"]: - print("mixture_tiling yes yes yes") - mixture_tiling_pipe = DiffusionPipeline.from_pretrained( - args.model_dir, - vae_encoder=pipe.vae_encoder, - vae_decoder=pipe.vae_decoder, - text_encoder=pipe.text_encoder, - tokenizer=pipe.tokenizer, - unet=pipe.unet, - scheduler=pipe.scheduler, - safety_checker=pipe.safety_checker, - feature_extractor=pipe.feature_extractor, - requires_safety_checker=pipe.requires_safety_checker, - # custom_pipeline="pipeline_fastdeploy_stable_diffusion_mixture_tiling", - custom_pipeline="/root/project/paddlenlp/ppdiffusers_upgrade/PaddleNLP/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_mixture_tiling.py", - ) - # custom_pipeline - mixture_tiling_pipe._progress_bar_config = pipe._progress_bar_config - # mixture_tiling - time_costs = [] - # warmup - mixture_tiling_pipe( - prompt=[ - [ - "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - # "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - # "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - ] - ], - tile_height=512, - tile_width=512, - tile_row_overlap=0, - tile_col_overlap=0, - guidance_scale=8, - seed=7178915308, - num_inference_steps=50, - infer_op_dict=None, - ) - print("==> Test mixture tiling.") - for step in trange(args.benchmark_steps): - start = time.time() - images = mixture_tiling_pipe( - prompt=[ - [ - "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - # "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, 
highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - # "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - ] - ], - tile_height=512, - tile_width=512, - tile_row_overlap=0, - tile_col_overlap=0, - guidance_scale=8, - seed=7178915308, - num_inference_steps=50, - infer_op_dict=None, - )["images"] - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." - ) - images[0].save("mixture_tiling.png") - - -if __name__ == "__main__": - args = parse_arguments() - main(args) diff --git a/ppdiffusers/deploy/infer_dygraph.py b/ppdiffusers/deploy/infer_dygraph.py deleted file mode 100644 index cd1ff74e6e0e..000000000000 --- a/ppdiffusers/deploy/infer_dygraph.py +++ /dev/null @@ -1,358 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import time -import warnings - -import numpy as np -import paddle -from tqdm.auto import trange - -from paddlenlp.trainer.argparser import strtobool -from paddlenlp.utils.log import logger -from ppdiffusers import DiffusionPipeline -from ppdiffusers.utils import load_image - -logger.set_level("WARNING") - - -def parse_arguments(): - - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_dir", - default="runwayml/stable-diffusion-v1-5", - help="The model directory of diffusion_model.", - ) - parser.add_argument("--inference_steps", type=int, default=50, help="The number of unet inference steps.") - parser.add_argument("--benchmark_steps", type=int, default=1, help="The number of performance benchmark steps.") - parser.add_argument( - "--task_name", - type=str, - default="text2img", - choices=[ - "text2img", - "img2img", - "inpaint", - "inpaint_legacy", - "cycle_diffusion", - "hiresfix", - "all", - ], - help="The task can be one of [text2img, img2img, inpaint, inpaint_legacy, cycle_diffusion, hiresfix, all]. ", - ) - parser.add_argument( - "--parse_prompt_type", - type=str, - default="lpw", - choices=[ - "raw", - "lpw", - ], - help="The parse_prompt_type can be one of [raw, lpw]. ", - ) - parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") - parser.add_argument( - "--attention_type", type=str, default="raw", choices=["raw", "cutlass", "flash", "all"], help="attention_type." - ) - parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. 
-1 means use cpu") - parser.add_argument( - "--scheduler", - type=str, - default="euler-ancestral", - choices=[ - "pndm", - "lms", - "euler", - "euler-ancestral", - "dpm-multi", - "dpm-single", - "unipc-multi", - "ddim", - "ddpm", - "deis-multi", - "heun", - "kdpm2-ancestral", - "kdpm2", - ], - help="The scheduler type of stable diffusion.", - ) - parser.add_argument("--height", type=int, default=512, help="Height of input image") - parser.add_argument("--width", type=int, default=512, help="Width of input image") - parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image") - parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image") - return parser.parse_args() - - -def main(args): - if args.device_id == -1: - paddle.set_device("cpu") - else: - paddle.set_device(f"gpu:{args.device_id}") - - seed = 1024 - paddle_dtype = paddle.float16 if args.use_fp16 else paddle.float32 - pipe = DiffusionPipeline.from_pretrained( - args.model_dir, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - paddle_dtype=paddle_dtype, - custom_pipeline="stable_diffusion_mega", - ) - pipe.set_progress_bar_config(disable=True) - pipe.change_scheduler(args.scheduler) - parse_prompt_type = args.parse_prompt_type - if args.attention_type == "all": - args.attention_type = ["raw", "cutlass", "flash"] - else: - args.attention_type = [args.attention_type] - - for attention_type in args.attention_type: - if attention_type == "raw": - pipe.disable_xformers_memory_efficient_attention() - else: - try: - pipe.enable_xformers_memory_efficient_attention(attention_type) - except Exception as e: - if attention_type == "flash": - warnings.warn( - "Attention type flash is not supported on your GPU! We need to use 3060、3070、3080、3090、4060、4070、4080、4090、A30、A100 etc." - ) - continue - else: - raise ValueError(e) - - width = args.width - height = args.height - hr_resize_width = args.hr_resize_width - hr_resize_height = args.hr_resize_height - folder = f"attn_{attention_type}_fp16" if args.use_fp16 else f"attn_{attention_type}_fp32" - os.makedirs(folder, exist_ok=True) - if args.task_name in ["text2img", "all"]: - # text2img - prompt = "a photo of an astronaut riding a horse on mars" - time_costs = [] - # warmup - pipe.text2img( - prompt, - num_inference_steps=10, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - ) - print("==> Test text2img performance.") - paddle.seed(seed) - for step in trange(args.benchmark_steps): - start = time.time() - images = pipe.text2img( - prompt, - num_inference_steps=args.inference_steps, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." 
- ) - images[0].save(f"{folder}/text2img.png") - - if args.task_name in ["img2img", "all"]: - # img2img - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" - init_image = load_image(img_url) - prompt = "A fantasy landscape, trending on artstation" - time_costs = [] - # warmup - pipe.img2img( - prompt, - image=init_image, - num_inference_steps=20, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - ) - print("==> Test img2img performance.") - for step in trange(args.benchmark_steps): - start = time.time() - paddle.seed(seed) - images = pipe.img2img( - prompt, - image=init_image, - num_inference_steps=args.inference_steps, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." - ) - images[0].save(f"{folder}/img2img.png") - - if args.task_name in ["inpaint", "inpaint_legacy", "all"]: - img_url = ( - "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" - ) - mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" - init_image = load_image(img_url) - mask_image = load_image(mask_url) - prompt = "Face of a yellow cat, high resolution, sitting on a park bench" - time_costs = [] - # warmup - if args.task_name in ["inpaint_legacy", "all"]: - call_fn = pipe.inpaint_legacy - task_name = "inpaint_legacy" - else: - call_fn = pipe.inpaint - task_name = args.task_name - if pipe.unet.config.in_channels == 4: - task_name = "inpaint_legacy" - elif pipe.unet.config.in_channels == 9: - task_name = "inpaint" - - call_fn( - prompt, - image=init_image, - mask_image=mask_image, - num_inference_steps=20, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - ) - print(f"==> Test {task_name} performance.") - for step in trange(args.benchmark_steps): - start = time.time() - paddle.seed(seed) - images = call_fn( - prompt, - image=init_image, - mask_image=mask_image, - num_inference_steps=args.inference_steps, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." 
- ) - - images[0].save(f"{folder}/{task_name}.png") - - if args.task_name in ["cycle_diffusion", "all"]: - pipe.change_scheduler("ddim") - image_url = ( - "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/ride_on_horse.png" - ) - init_image = load_image(image_url) - source_prompt = "An astronaut riding a horse" - prompt = "An astronaut riding an elephant" - time_costs = [] - # warmup - pipe.cycle_diffusion( - prompt=prompt, - source_prompt=source_prompt, - image=init_image, - num_inference_steps=10, - eta=0.1, - strength=0.8, - guidance_scale=2, - source_guidance_scale=1, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - ).images[0] - print("==> Test cycle diffusion performance.") - for step in trange(args.benchmark_steps): - start = time.time() - paddle.seed(seed) - images = pipe.cycle_diffusion( - prompt=prompt, - source_prompt=source_prompt, - image=init_image, - num_inference_steps=args.inference_steps, - eta=0.1, - strength=0.8, - guidance_scale=2, - source_guidance_scale=1, - height=height, - width=width, - parse_prompt_type=parse_prompt_type, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." - ) - images[0].save(f"{folder}/cycle_diffusion.png") - - if args.task_name in ["hiresfix", "all"]: - # hiresfix - prompt = "a photo of an astronaut riding a horse on mars" - time_costs = [] - # warmup - pipe.hires_fix( - prompt, - height=height, - width=width, - num_inference_steps=20, - hires_ratio=0.5, - hr_resize_width=hr_resize_width, - hr_resize_height=hr_resize_height, - enable_hr=True, - parse_prompt_type=parse_prompt_type, - ) - print("==> Test hiresfix performance.") - for step in trange(args.benchmark_steps): - start = time.time() - paddle.seed(seed) - images = pipe.hires_fix( - prompt, - height=height, - width=width, - num_inference_steps=args.inference_steps, - hires_ratio=0.5, - hr_resize_width=hr_resize_width, - hr_resize_height=hr_resize_height, - enable_hr=True, - parse_prompt_type=parse_prompt_type, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." - ) - images[0].save(f"{folder}/hiresfix.png") - - -if __name__ == "__main__": - args = parse_arguments() - main(args) diff --git a/ppdiffusers/deploy/infer_dygraph_torch.py b/ppdiffusers/deploy/infer_dygraph_torch.py deleted file mode 100644 index d648052371d3..000000000000 --- a/ppdiffusers/deploy/infer_dygraph_torch.py +++ /dev/null @@ -1,432 +0,0 @@ -# Copyright (c) 2023 torchtorch Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import time - -import torch - -torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention -delattr(torch.nn.functional, "scaled_dot_product_attention") -import numpy as np -from diffusers import ( - CycleDiffusionPipeline, - DDIMScheduler, - DDPMScheduler, - DEISMultistepScheduler, - DiffusionPipeline, - DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - HeunDiscreteScheduler, - KDPM2AncestralDiscreteScheduler, - KDPM2DiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - UniPCMultistepScheduler, -) -from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0 -from diffusers.utils import load_image -from tqdm.auto import trange - - -def strtobool(v): - if isinstance(v, bool): - return v - if v.lower() in ("yes", "true", "t", "y", "1"): - return True - elif v.lower() in ("no", "false", "f", "n", "0"): - return False - else: - raise ValueError( - f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)." - ) - - -def change_scheduler(self, scheduler_type="ddim"): - self.orginal_scheduler_config = self.scheduler.config - scheduler_type = scheduler_type.lower() - if scheduler_type == "pndm": - scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) - elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "dpm-multi": - scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "dpm-single": - scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "kdpm2-ancestral": - scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "kdpm2": - scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "unipc-multi": - scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "ddim": - scheduler = DDIMScheduler.from_config( - self.orginal_scheduler_config, - steps_offset=1, - clip_sample=False, - set_alpha_to_one=False, - ) - elif scheduler_type == "ddpm": - scheduler = DDPMScheduler.from_config( - self.orginal_scheduler_config, - ) - elif scheduler_type == "deis-multi": - scheduler = DEISMultistepScheduler.from_config( - self.orginal_scheduler_config, - ) - else: - raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - return scheduler - - -def parse_arguments(): - - parser = argparse.ArgumentParser() - parser.add_argument( - "--pretrained_model_name_or_path", - default="runwayml/stable-diffusion-v1-5", - help="The model directory of diffusion_model.", - ) - parser.add_argument("--inference_steps", type=int, default=50, help="The number of unet inference steps.") - parser.add_argument("--benchmark_steps", type=int, default=10, help="The number of performance 
benchmark steps.") - parser.add_argument( - "--task_name", - type=str, - default="all", - choices=[ - "text2img", - "img2img", - "inpaint", - "inpaint_legacy", - "cycle_diffusion", - "all", - ], - help="The task can be one of [text2img, img2img, inpaint, inpaint_legacy, cycle_diffusion, hiresfix, all]. ", - ) - parser.add_argument( - "--parse_prompt_type", - type=str, - default="raw", - choices=[ - "raw", - "lpw", - ], - help="The parse_prompt_type can be one of [raw, lpw]. ", - ) - parser.add_argument("--channels_last", type=strtobool, default=False, help="Wheter to use channels_last") - parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") - parser.add_argument("--tf32", type=strtobool, default=True, help="tf32") - parser.add_argument("--compile", type=strtobool, default=False, help="compile") - parser.add_argument( - "--attention_type", - type=str, - default="sdp", - choices=[ - "raw", - "sdp", - ], - help="attention_type.", - ) - parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu") - parser.add_argument( - "--scheduler", - type=str, - default="euler-ancestral", - choices=[ - "pndm", - "lms", - "euler", - "euler-ancestral", - "dpm-multi", - "dpm-single", - "unipc-multi", - "ddim", - "ddpm", - "deis-multi", - "heun", - "kdpm2-ancestral", - "kdpm2", - ], - help="The scheduler type of stable diffusion.", - ) - parser.add_argument("--height", type=int, default=512, help="Height of input image") - parser.add_argument("--width", type=int, default=512, help="Width of input image") - return parser.parse_args() - - -def attn_processors(self): - processors = {} - - def fn_recursive_add_processors(name: str, module, processors): - if hasattr(module, "set_processor"): - processors[f"{name}.processor"] = module.processor - - for sub_name, child in module.named_children(): - fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) - - return processors - - for name, module in self.named_children(): - fn_recursive_add_processors(name, module, processors) - - return processors - - -def set_attn_processor(self, processor): - count = len(attn_processors(self).keys()) - - if isinstance(processor, dict) and len(processor) != count: - raise ValueError( - f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" - f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
- ) - - def fn_recursive_attn_processor(name: str, module, processor): - if hasattr(module, "set_processor"): - if not isinstance(processor, dict): - module.set_processor(processor) - else: - module.set_processor(processor.pop(f"{name}.processor")) - - for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) - - for name, module in self.named_children(): - fn_recursive_attn_processor(name, module, processor) - - -def main(args): - if args.tf32: - torch.backends.cuda.matmul.allow_tf32 = True - else: - torch.backends.cuda.matmul.allow_tf32 = False - - seed = 1024 - torch_dtype = torch.float16 if args.use_fp16 else torch.float32 - pipe = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - torch_dtype=torch_dtype, - custom_pipeline="stable_diffusion_mega" if args.parse_prompt_type == "raw" else "lpw_stable_diffusion", - ) - scheduler = change_scheduler(pipe, args.scheduler) - pipe.scheduler = scheduler - if args.device_id >= 0: - pipe.to(f"cuda:{args.device_id}") - - if args.attention_type == "all": - args.attention_type = ["raw", "sdp"] - else: - args.attention_type = [args.attention_type] - - for attention_type in args.attention_type: - attn_prrocessor_cls = AttnProcessor if attention_type == "raw" else AttnProcessor2_0 - if attention_type == "sdp": - torch.nn.functional.scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention_ - - set_attn_processor(pipe.unet, attn_prrocessor_cls()) - set_attn_processor(pipe.vae, attn_prrocessor_cls()) - if args.channels_last: - pipe.unet.to(memory_format=torch.channels_last) - - if args.compile: - print("Run torch compile") - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - - width = args.width - height = args.height - pipe.set_progress_bar_config(disable=True) - - folder = f"torch_attn_{attention_type}_fp16" if args.use_fp16 else f"torch_attn_{attention_type}_fp32" - os.makedirs(folder, exist_ok=True) - if args.task_name in ["text2img", "all"]: - # text2img - prompt = "a photo of an astronaut riding a horse on mars" - time_costs = [] - # warmup - pipe.text2img( - prompt, - num_inference_steps=10, - height=height, - width=width, - ) - print("==> Test text2img performance.") - torch.cuda.manual_seed(seed) - for step in trange(args.benchmark_steps): - start = time.time() - images = pipe.text2img( - prompt, - num_inference_steps=args.inference_steps, - height=height, - width=width, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." 
- ) - images[0].save(f"{folder}/text2img.png") - - if args.task_name in ["img2img", "all"]: - # img2img - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" - init_image = load_image(img_url).resize((width, height)) - prompt = "A fantasy landscape, trending on artstation" - time_costs = [] - # warmup - pipe.img2img( - prompt, - image=init_image, - num_inference_steps=20, - height=height, - width=width, - ) - print("==> Test img2img performance.") - for step in trange(args.benchmark_steps): - start = time.time() - torch.cuda.manual_seed(seed) - images = pipe.img2img( - prompt, - image=init_image, - num_inference_steps=args.inference_steps, - height=height, - width=width, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." - ) - images[0].save(f"{folder}/img2img.png") - - if args.task_name in ["inpaint", "inpaint_legacy", "all"]: - img_url = ( - "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" - ) - mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" - init_image = load_image(img_url).resize((width, height)) - mask_image = load_image(mask_url).resize((width, height)) - prompt = "Face of a yellow cat, high resolution, sitting on a park bench" - time_costs = [] - # warmup - if args.task_name in ["inpaint_legacy", "all"]: - call_fn = pipe.inpaint - task_name = "inpaint_legacy" - else: - call_fn = pipe.inpaint - task_name = args.task_name - if pipe.unet.config.in_channels == 4: - task_name = "inpaint_legacy" - elif pipe.unet.config.in_channels == 9: - task_name = "inpaint" - - call_fn( - prompt, - image=init_image, - mask_image=mask_image, - num_inference_steps=20, - ) - print(f"==> Test {task_name} performance.") - for step in trange(args.benchmark_steps): - start = time.time() - torch.cuda.manual_seed(seed) - images = call_fn( - prompt, - image=init_image, - mask_image=mask_image, - num_inference_steps=args.inference_steps, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." - ) - - images[0].save(f"{folder}/{task_name}.png") - - if args.task_name in ["cycle_diffusion", "all"]: - # need fix diffuers=0.17.1, self.unet return_dict=False! 
- cycle_pipe = CycleDiffusionPipeline( - vae=pipe.vae, - text_encoder=pipe.text_encoder, - tokenizer=pipe.tokenizer, - unet=pipe.unet, - scheduler=scheduler, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - cycle_pipe.set_progress_bar_config(disable=True) - scheduler = change_scheduler(cycle_pipe, "ddim") - cycle_pipe.scheduler = scheduler - image_url = "ride_on_horse.png" - init_image = load_image(image_url).resize((width, height)) - source_prompt = "An astronaut riding a horse" - prompt = "An astronaut riding an elephant" - time_costs = [] - # warmup - cycle_pipe( - prompt=prompt, - source_prompt=source_prompt, - image=init_image, - num_inference_steps=10, - eta=0.1, - strength=0.8, - guidance_scale=2, - source_guidance_scale=1, - ).images[0] - print("==> Test cycle diffusion performance.") - for step in trange(args.benchmark_steps): - start = time.time() - torch.cuda.manual_seed(seed) - images = cycle_pipe( - prompt=prompt, - source_prompt=source_prompt, - image=init_image, - num_inference_steps=args.inference_steps, - eta=0.1, - strength=0.8, - guidance_scale=2, - source_guidance_scale=1, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." - ) - images[0].save(f"{folder}/cycle_diffusion.png") - - -if __name__ == "__main__": - args = parse_arguments() - main(args) diff --git a/ppdiffusers/deploy/requirements.txt b/ppdiffusers/deploy/requirements.txt deleted file mode 100644 index 13116611a171..000000000000 --- a/ppdiffusers/deploy/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -ppdiffusers>=0.16.1 diff --git a/ppdiffusers/deploy/scripts/test_controlnet_infer_dygraph.sh b/ppdiffusers/deploy/scripts/test_controlnet_infer_dygraph.sh deleted file mode 100644 index ba507688c947..000000000000 --- a/ppdiffusers/deploy/scripts/test_controlnet_infer_dygraph.sh +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -cd ../controlnet - -# FP16 -python infer_dygraph.py --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \ - --controlnet_pretrained_model_name_or_path lllyasviel/sd-controlnet-canny \ - --task_name all --use_fp16 True \ - --attention_type all --benchmark_steps 5 --device_id 0 --parse_prompt_type lpw - -# nohup sh test_controlnet_infer_dygraph.sh 1> test_controlnet_infer_dygraph.log 2>&1 & - -# FP32 -# python infer_dygraph.py --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \ -# --controlnet_pretrained_model_name_or_path lllyasviel/sd-controlnet-canny \ -# --task_name all --use_fp16 False \ -# --attention_type all --benchmark_steps 5 --device_id 0 - -# # raw -# ==> Test text2img_control performance. 
-# Mean latency: 2.765781 s, p50 latency: 2.765087 s, p90 latency: 2.767249 s, p95 latency: 2.767289 s. -# ==> Test img2img_control performance. -# Mean latency: 2.249113 s, p50 latency: 2.247525 s, p90 latency: 2.252412 s, p95 latency: 2.253428 s. -# ==> Test inpaint_legacy_control performance. -# Mean latency: 2.798517 s, p50 latency: 2.798506 s, p90 latency: 2.799392 s, p95 latency: 2.799600 s. -# ==> Test hiresfix_control performance. -# Mean latency: 5.625017 s, p50 latency: 5.624560 s, p90 latency: 5.626832 s, p95 latency: 5.627412 s. - -# # cutlass -# ==> Test text2img_control performance. -# Mean latency: 2.221491 s, p50 latency: 2.219046 s, p90 latency: 2.226926 s, p95 latency: 2.227513 s. -# ==> Test img2img_control performance. -# Mean latency: 1.845735 s, p50 latency: 1.845492 s, p90 latency: 1.847032 s, p95 latency: 1.847059 s. -# ==> Test inpaint_legacy_control performance. -# Mean latency: 2.299109 s, p50 latency: 2.299197 s, p90 latency: 2.300616 s, p95 latency: 2.300845 s. -# ==> Test hiresfix_control performance. -# Mean latency: 3.397126 s, p50 latency: 3.397122 s, p90 latency: 3.398814 s, p95 latency: 3.399294 s. - -# # flash -# ==> Test text2img_control performance. -# Mean latency: 2.247151 s, p50 latency: 2.245066 s, p90 latency: 2.251284 s, p95 latency: 2.252165 s. -# ==> Test img2img_control performance. -# Mean latency: 1.831867 s, p50 latency: 1.832337 s, p90 latency: 1.832757 s, p95 latency: 1.832806 s. -# ==> Test inpaint_legacy_control performance. -# Mean latency: 2.291782 s, p50 latency: 2.290435 s, p90 latency: 2.298838 s, p95 latency: 2.300244 s. -# ==> Test hiresfix_control performance. -# Mean latency: 3.166110 s, p50 latency: 3.164805 s, p90 latency: 3.171124 s, p95 latency: 3.172821 s. diff --git a/ppdiffusers/deploy/scripts/test_controlnet_infer_dygraph_torch.sh b/ppdiffusers/deploy/scripts/test_controlnet_infer_dygraph_torch.sh deleted file mode 100644 index 48ca96df9bb3..000000000000 --- a/ppdiffusers/deploy/scripts/test_controlnet_infer_dygraph_torch.sh +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-cd ../controlnet -pip install diffusers==0.17.1 - -python infer_dygraph_toch.py --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \ - --controlnet_pretrained_model_name_or_path lllyasviel/sd-controlnet-canny \ - --task_name all --use_fp16 True \ - --attention_type raw --benchmark_steps 10 --device_id 0 --parse_prompt_type raw - -python infer_dygraph_toch.py --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \ - --controlnet_pretrained_model_name_or_path lllyasviel/sd-controlnet-canny \ - --task_name all --use_fp16 True \ - --attention_type sdp --benchmark_steps 10 --device_id 0 --parse_prompt_type raw - -python infer_dygraph_toch.py --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \ - --controlnet_pretrained_model_name_or_path lllyasviel/sd-controlnet-canny \ - --task_name all --use_fp16 True \ - --attention_type sdp --benchmark_steps 10 --device_id 0 --parse_prompt_type raw --compile True --channels_last True - -# nohup sh test_controlnet_infer_dygraph_torch.sh 1> test_controlnet_infer_dygraph_torch.log 2>&1 & diff --git a/ppdiffusers/deploy/scripts/test_controlnet_infer_fd.sh b/ppdiffusers/deploy/scripts/test_controlnet_infer_fd.sh deleted file mode 100644 index a600920e0dc9..000000000000 --- a/ppdiffusers/deploy/scripts/test_controlnet_infer_fd.sh +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -cd ../controlnet -# export LD_LIBRARY_PATH=/usr/local/cuda-11.7/targets/x86_64-linux/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 - -# test control_sd15_canny -python export_model.py \ - --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \ - --controlnet_pretrained_model_name_or_path lllyasviel/sd-controlnet-canny \ - --output_path control_sd15_canny - -python infer.py \ - --model_dir control_sd15_canny \ - --scheduler preconfig-euler-ancestral \ - --backend paddle_tensorrt \ - --use_fp16 True \ - --device gpu \ - --device_id 0 \ - --task_name all \ - --infer_op zero_copy_infer \ - --benchmark_steps 10 --parse_prompt_type lpw - -python infer.py \ - --model_dir control_sd15_canny \ - --scheduler preconfig-euler-ancestral \ - --backend paddle_tensorrt \ - --use_fp16 True \ - --device gpu \ - --device_id 0 \ - --task_name all \ - --infer_op raw \ - --benchmark_steps 10 --parse_prompt_type lpw -# nohup sh test_controlnet_infer_fd.sh 1> test_controlnet_infer_fd.log 2>&1 & - -## zero_copy_infer lpw -# ==> Test text2img_control performance. -# Mean latency: 1.116596 s, p50 latency: 1.116411 s, p90 latency: 1.117224 s, p95 latency: 1.117570 s. -# ==> Test img2img_control performance. -# Mean latency: 0.923659 s, p50 latency: 0.923360 s, p90 latency: 0.924491 s, p95 latency: 0.924889 s. -# ==> Test inpaint_legacy_control performance. -# Mean latency: 0.990658 s, p50 latency: 0.990560 s, p90 latency: 0.991377 s, p95 latency: 0.991382 s. -# ==> Test hiresfix_control performance. 
-# Mean latency: 1.941339 s, p50 latency: 1.940671 s, p90 latency: 1.943967 s, p95 latency: 1.944597 s. - -## raw lpw -# ==> Test text2img_control performance. -# Mean latency: 1.316932 s, p50 latency: 1.316731 s, p90 latency: 1.318196 s, p95 latency: 1.318324 s. -# ==> Test img2img_control performance. -# Mean latency: 1.081752 s, p50 latency: 1.081734 s, p90 latency: 1.081981 s, p95 latency: 1.082050 s. -# ==> Test inpaint_legacy_control performance. -# Mean latency: 1.088706 s, p50 latency: 1.088135 s, p90 latency: 1.090777 s, p95 latency: 1.091583 s. -# ==> Test hiresfix_control performance. -# Mean latency: 2.213738 s, p50 latency: 2.212821 s, p90 latency: 2.214525 s, p95 latency: 2.218274 s. diff --git a/ppdiffusers/deploy/scripts/test_infer_dygraph.sh b/ppdiffusers/deploy/scripts/test_infer_dygraph.sh deleted file mode 100644 index b2d69f7e21e4..000000000000 --- a/ppdiffusers/deploy/scripts/test_infer_dygraph.sh +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -cd .. - -# FP16 -python infer_dygraph.py --model_dir runwayml/stable-diffusion-v1-5 \ - --task_name all --use_fp16 True \ - --attention_type all --benchmark_steps 5 --device_id 0 --parse_prompt_type lpw -python infer_dygraph.py --model_dir runwayml/stable-diffusion-inpainting \ - --task_name inpaint --use_fp16 True \ - --attention_type all --benchmark_steps 5 --device_id 0 --parse_prompt_type lpw - -# nohup sh test_infer_dygraph.sh 1> test_infer_dygraph.log 2>&1 & - -# # FP32 -# python infer_dygraph.py --model_dir runwayml/stable-diffusion-v1-5 \ -# --task_name all --use_fp16 False \ -# --attention_type all --benchmark_steps 5 --device_id 0 -# python infer_dygraph.py --model_dir runwayml/stable-diffusion-inpainting \ -# --task_name inpaint --use_fp16 False \ -# --attention_type all --benchmark_steps 5 --device_id 0 - -## raw -# ==> Test text2img performance. -# Mean latency: 1.966795 s, p50 latency: 1.966347 s, p90 latency: 1.967994 s, p95 latency: 1.968173 s. -# ==> Test img2img performance. -# Mean latency: 1.602304 s, p50 latency: 1.601587 s, p90 latency: 1.603798 s, p95 latency: 1.604156 s. -# ==> Test inpaint_legacy performance. -# Mean latency: 2.002676 s, p50 latency: 2.002807 s, p90 latency: 2.003454 s, p95 latency: 2.003653 s. -# ==> Test cycle diffusion performance. -# Mean latency: 2.728374 s, p50 latency: 2.728487 s, p90 latency: 2.729091 s, p95 latency: 2.729288 s. -# ==> Test hiresfix performance. -# Mean latency: 4.037808 s, p50 latency: 4.037807 s, p90 latency: 4.039283 s, p95 latency: 4.039706 s. - -## cutlass -# ==> Test text2img performance. -# Mean latency: 1.600378 s, p50 latency: 1.600454 s, p90 latency: 1.602175 s, p95 latency: 1.602312 s. -# ==> Test img2img performance. -# Mean latency: 1.316588 s, p50 latency: 1.315549 s, p90 latency: 1.318886 s, p95 latency: 1.319499 s. -# ==> Test inpaint_legacy performance. 
-# Mean latency: 1.624272 s, p50 latency: 1.617674 s, p90 latency: 1.638277 s, p95 latency: 1.645022 s. -# ==> Test cycle diffusion performance. -# Mean latency: 2.010290 s, p50 latency: 2.010423 s, p90 latency: 2.010763 s, p95 latency: 2.010806 s. -# ==> Test hiresfix performance. -# Mean latency: 2.413675 s, p50 latency: 2.413513 s, p90 latency: 2.414645 s, p95 latency: 2.414994 s. - -## flash -# ==> Test text2img performance. -# Mean latency: 1.552317 s, p50 latency: 1.550014 s, p90 latency: 1.556045 s, p95 latency: 1.556484 s. -# ==> Test img2img performance. -# Mean latency: 1.263727 s, p50 latency: 1.263574 s, p90 latency: 1.266098 s, p95 latency: 1.266871 s. -# ==> Test inpaint_legacy performance. -# Mean latency: 1.592698 s, p50 latency: 1.591540 s, p90 latency: 1.595203 s, p95 latency: 1.595825 s. -# ==> Test cycle diffusion performance. -# Mean latency: 1.914255 s, p50 latency: 1.914288 s, p90 latency: 1.914754 s, p95 latency: 1.914828 s. -# ==> Test hiresfix performance. -# Mean latency: 2.230694 s, p50 latency: 2.230958 s, p90 latency: 2.231327 s, p95 latency: 2.231398 s. - -# ==> Test inpaint performance. -# Mean latency: 1.991679 s, p50 latency: 1.991460 s, p90 latency: 1.992477 s, p95 latency: 1.992665 s. -# ==> Test inpaint performance. -# Mean latency: 1.550563 s, p50 latency: 1.550274 s, p90 latency: 1.552293 s, p95 latency: 1.552326 s. -# ==> Test inpaint performance. -# Mean latency: 1.538872 s, p50 latency: 1.539402 s, p90 latency: 1.540718 s, p95 latency: 1.541104 s. diff --git a/ppdiffusers/deploy/scripts/test_infer_dygraph_torch.sh b/ppdiffusers/deploy/scripts/test_infer_dygraph_torch.sh deleted file mode 100644 index 21561c369352..000000000000 --- a/ppdiffusers/deploy/scripts/test_infer_dygraph_torch.sh +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -cd .. -pip install diffusers==0.17.1 - -python infer_dygraph_toch.py --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \ - --task_name all --use_fp16 True \ - --attention_type raw --benchmark_steps 10 --device_id 0 --parse_prompt_type raw - -python infer_dygraph_toch.py --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \ - --task_name all --use_fp16 True \ - --attention_type sdp --benchmark_steps 10 --device_id 0 --parse_prompt_type raw - -python infer_dygraph_toch.py --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \ - --task_name all --use_fp16 True \ - --attention_type sdp --benchmark_steps 10 --device_id 0 --parse_prompt_type raw --compile True --channels_last True - -# nohup sh test_infer_dygraph_torch.sh 1> test_infer_dygraph_torch.log 2>&1 & diff --git a/ppdiffusers/deploy/scripts/test_infer_fd.sh b/ppdiffusers/deploy/scripts/test_infer_fd.sh deleted file mode 100644 index 9c6f96f2710e..000000000000 --- a/ppdiffusers/deploy/scripts/test_infer_fd.sh +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -cd .. -# export LD_LIBRARY_PATH=/usr/local/cuda-11.7/targets/x86_64-linux/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 - -# test stable-diffusion-v1-5 -python export_model.py \ - --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \ - --output_path stable-diffusion-v1-5 - -python infer.py --model_dir stable-diffusion-v1-5/ \ - --scheduler preconfig-euler-ancestral \ - --backend paddle_tensorrt \ - --use_fp16 True \ - --device gpu \ - --device_id 0 \ - --task_name all \ - --infer_op zero_copy_infer \ - --benchmark_steps 10 --parse_prompt_type lpw - -python infer.py --model_dir stable-diffusion-v1-5/ \ - --scheduler preconfig-euler-ancestral \ - --backend paddle_tensorrt \ - --use_fp16 True \ - --device gpu \ - --device_id 0 \ - --task_name all \ - --infer_op raw \ - --benchmark_steps 10 --parse_prompt_type lpw - -# test cycle_diffusion -python export_model.py \ - --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \ - --output_path stable-diffusion-v1-5-cycle_diffusion - -python infer.py --model_dir stable-diffusion-v1-5-cycle_diffusion/ \ - --scheduler preconfig-euler-ancestral \ - --backend paddle_tensorrt \ - --use_fp16 True \ - --device gpu \ - --device_id 0 \ - --task_name cycle_diffusion \ - --infer_op zero_copy_infer \ - --benchmark_steps 10 --parse_prompt_type lpw - -python infer.py --model_dir stable-diffusion-v1-5-cycle_diffusion/ \ - --scheduler preconfig-euler-ancestral \ - --backend paddle_tensorrt \ - --use_fp16 True \ - --device gpu \ - --device_id 0 \ - --task_name cycle_diffusion \ - --infer_op raw \ - --benchmark_steps 10 --parse_prompt_type lpw - -# test stable-diffusion-v1-5-inpainting -python export_model.py \ - --pretrained_model_name_or_path runwayml/stable-diffusion-inpainting \ - --output_path stable-diffusion-v1-5-inpainting - -python infer.py \ - --model_dir stable-diffusion-v1-5-inpainting \ - --scheduler preconfig-euler-ancestral \ - --backend paddle_tensorrt \ - --use_fp16 True \ - --device gpu \ - --device_id 0 \ - --task_name inpaint \ - --infer_op zero_copy_infer \ - --benchmark_steps 10 --parse_prompt_type lpw - -python infer.py \ - --model_dir stable-diffusion-v1-5-inpainting \ - --scheduler preconfig-euler-ancestral \ - --backend paddle_tensorrt \ - --use_fp16 True \ - --device gpu \ - --device_id 0 \ - --task_name inpaint \ - --infer_op raw \ - --benchmark_steps 10 --parse_prompt_type lpw - -# nohup sh test_infer_fd.sh 1> test_infer_fd.log 2>&1 & - -## zero_copy_infer lpw -# ==> Test text2img performance. -# Mean latency: 0.793726 s, p50 latency: 0.793276 s, p90 latency: 0.795765 s, p95 latency: 0.795784 s. -# ==> Test img2img performance. -# Mean latency: 0.662667 s, p50 latency: 0.662543 s, p90 latency: 0.663988 s, p95 latency: 0.664083 s. -# ==> Test inpaint_legacy performance. -# Mean latency: 0.713983 s, p50 latency: 0.713049 s, p90 latency: 0.714997 s, p95 latency: 0.719060 s. -# ==> Test hiresfix performance. 
-# Mean latency: 1.379235 s, p50 latency: 1.378860 s, p90 latency: 1.381117 s, p95 latency: 1.381354 s. - -## raw lpw -# ==> Test text2img performance. -# Mean latency: 0.858216 s, p50 latency: 0.856819 s, p90 latency: 0.863076 s, p95 latency: 0.865202 s. -# ==> Test img2img performance. -# Mean latency: 0.715608 s, p50 latency: 0.713970 s, p90 latency: 0.722402 s, p95 latency: 0.723554 s. -# ==> Test inpaint_legacy performance. -# Mean latency: 0.723306 s, p50 latency: 0.723204 s, p90 latency: 0.725012 s, p95 latency: 0.726074 s. -# ==> Test hiresfix performance. -# Mean latency: 1.449147 s, p50 latency: 1.447849 s, p90 latency: 1.454234 s, p95 latency: 1.454289 s. - -## zero_copy_infer lpw ddim no preconfig -# ==> Test cycle diffusion performance. -# Mean latency: 1.347564 s, p50 latency: 1.347331 s, p90 latency: 1.350945 s, p95 latency: 1.352116 s. -## raw lpw -# ==> Test cycle diffusion performance. -# Mean latency: 1.358652 s, p50 latency: 1.356717 s, p90 latency: 1.364285 s, p95 latency: 1.364509 s. - -## zero_copy_infer lpw -# ==> Test inpaint performance. -# Mean latency: 0.663649 s, p50 latency: 0.663355 s, p90 latency: 0.664185 s, p95 latency: 0.665098 s. - -## raw lpw -# ==> Test inpaint performance. -# Mean latency: 0.716995 s, p50 latency: 0.716940 s, p90 latency: 0.717639 s, p95 latency: 0.717953 s. diff --git a/ppdiffusers/deploy/stable_diffusion_image_variation/export_model.py b/ppdiffusers/deploy/stable_diffusion_image_variation/export_model.py deleted file mode 100644 index 05b3c4466419..000000000000 --- a/ppdiffusers/deploy/stable_diffusion_image_variation/export_model.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from pathlib import Path -from types import MethodType - -import paddle - -from ppdiffusers import ( - FastDeployRuntimeModel, - FastDeployStableDiffusionImageVariationPipeline, - StableDiffusionImageVariationPipeline, - UNet2DConditionModel, -) - - -def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( - model_path: str, - output_path: str, - sample: bool = False, - height: int = None, - width: int = None, -): - # specify unet model with unet pre_temb_act opt enabled. 
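- # Export flow (summary): load the ppdiffusers pipeline, trace image_encoder,
- # unet, vae encoder and vae decoder to static graphs with paddle.jit.to_static
- # and paddle.jit.save, then assemble the exported graphs into a FastDeploy pipeline.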
- unet_model = UNet2DConditionModel.from_pretrained(model_path, resnet_pre_temb_non_linearity=True, subfolder="unet") - pipeline = StableDiffusionImageVariationPipeline.from_pretrained(model_path, unet=unet_model, safety_checker=None) - # make sure we disable xformers - pipeline.disable_xformers_memory_efficient_attention() - output_path = Path(output_path) - # calculate latent's H and W - latent_height = height // 8 if height is not None else None - latent_width = width // 8 if width is not None else None - # get arguments - cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280 - unet_channels = pipeline.unet.config.in_channels # 4 or 9 - vae_in_channels = pipeline.vae.config.in_channels # 3 - vae_latent_channels = pipeline.vae.config.latent_channels # 4 - print( - f"cross_attention_dim: {cross_attention_dim}\n", - f"unet_in_channels: {unet_channels}\n", - f"vae_encoder_in_channels: {vae_in_channels}\n", - f"vae_decoder_latent_channels: {vae_latent_channels}", - ) - # 1. Convert image_encoder - image_encoder = paddle.jit.to_static( - pipeline.image_encoder, - input_spec=[ - paddle.static.InputSpec(shape=[None, 3, 224, 224], dtype="float32", name="pixel_values") - ], # pixel_values - ) - save_path = os.path.join(args.output_path, "image_encoder", "inference") - paddle.jit.save(image_encoder, save_path) - print(f"Save image_encoder model in {save_path} successfully.") - del pipeline.image_encoder - - # 2. Convert unet - unet = paddle.jit.to_static( - pipeline.unet, - input_spec=[ - paddle.static.InputSpec( - shape=[None, unet_channels, latent_height, latent_width], dtype="float32", name="sample" - ), # sample - paddle.static.InputSpec(shape=[1], dtype="float32", name="timestep"), # timestep - paddle.static.InputSpec( - shape=[None, None, cross_attention_dim], dtype="float32", name="encoder_hidden_states" - ), # encoder_hidden_states - ], - ) - save_path = os.path.join(args.output_path, "unet", "inference") - paddle.jit.save(unet, save_path) - print(f"Save unet model in {save_path} successfully.") - del pipeline.unet - - def forward_vae_encoder_mode(self, z): - return self.encode(z, True).latent_dist.mode() - - def forward_vae_encoder_sample(self, z): - return self.encode(z, True).latent_dist.sample() - - # 3. Convert vae encoder - vae_encoder = pipeline.vae - if sample: - vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder) - else: - vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder) - - vae_encoder = paddle.jit.to_static( - vae_encoder, - input_spec=[ - paddle.static.InputSpec( - shape=[None, vae_in_channels, height, width], - dtype="float32", - name="sample", # N, C, H, W - ), # latent - ], - ) - # Save vae_encoder in static graph model. - save_path = os.path.join(args.output_path, "vae_encoder", "inference") - paddle.jit.save(vae_encoder, save_path) - print(f"Save vae_encoder model in {save_path} successfully.") - - # 4. Convert vae encoder - vae_decoder = pipeline.vae - - def forward_vae_decoder(self, z): - return self.decode(z, True).sample - - vae_decoder.forward = MethodType(forward_vae_decoder, vae_decoder) - vae_decoder = paddle.jit.to_static( - vae_decoder, - input_spec=[ - paddle.static.InputSpec( - shape=[None, vae_latent_channels, latent_height, latent_width], dtype="float32", name="latent_sample" - ), # latent_sample - ], - ) - # Save vae_decoder in static graph model. 
- save_path = os.path.join(args.output_path, "vae_decoder", "inference") - paddle.jit.save(vae_decoder, save_path) - print(f"Save vae_decoder model in {save_path} successfully.") - del pipeline.vae - - fd_pipe_cls = FastDeployStableDiffusionImageVariationPipeline - - fastdeploy_pipeline = fd_pipe_cls( - vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), - vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_decoder"), - image_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "image_encoder"), - unet=FastDeployRuntimeModel.from_pretrained(output_path / "unet"), - scheduler=pipeline.scheduler, - safety_checker=None, - feature_extractor=pipeline.feature_extractor, - requires_safety_checker=False, - ) - fastdeploy_pipeline.save_pretrained(output_path) - print("FastDeploy pipeline saved to", output_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - required=True, - help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).", - ) - parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.") - parser.add_argument( - "--sample", action="store_true", default=False, help="Export the vae encoder in mode or sample" - ) - parser.add_argument("--height", type=int, default=None, help="The height of output images. Default: None") - parser.add_argument("--width", type=int, default=None, help="The width of output images. Default: None") - args = parser.parse_args() - - convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( - args.pretrained_model_name_or_path, args.output_path, args.sample, args.height, args.width - ) diff --git a/ppdiffusers/deploy/stable_diffusion_image_variation/infer.py b/ppdiffusers/deploy/stable_diffusion_image_variation/infer.py deleted file mode 100644 index 7cf8fc680780..000000000000 --- a/ppdiffusers/deploy/stable_diffusion_image_variation/infer.py +++ /dev/null @@ -1,419 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import time - -import fastdeploy as fd -import numpy as np -import paddle -from tqdm.auto import trange - -from paddlenlp.trainer.argparser import strtobool -from ppdiffusers import FastDeployStableDiffusionImageVariationPipeline -from ppdiffusers.utils import load_image - - -def parse_arguments(): - - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_dir", - default="lambdalabs/sd-image-variations-diffusers@fastdeploy", - help="The model directory of diffusion_model.", - ) - parser.add_argument("--inference_steps", type=int, default=50, help="The number of unet inference steps.") - parser.add_argument("--benchmark_steps", type=int, default=1, help="The number of performance benchmark steps.") - parser.add_argument( - "--backend", - type=str, - default="paddle_tensorrt", - # Note(zhoushunjie): Will support 'tensorrt' soon. 
- choices=["onnx_runtime", "paddle", "paddlelite", "paddle_tensorrt"], - help="The inference runtime backend of unet model and text encoder model.", - ) - parser.add_argument( - "--device", - type=str, - default="gpu", - # Note(shentanyue): Will support more devices. - choices=[ - "cpu", - "gpu", - "huawei_ascend_npu", - "kunlunxin_xpu", - ], - help="The inference runtime device of models.", - ) - parser.add_argument( - "--parse_prompt_type", - type=str, - default="lpw", - choices=[ - "raw", - "lpw", - ], - help="The parse_prompt_type can be one of [raw, lpw]. ", - ) - parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") - parser.add_argument("--use_bf16", type=strtobool, default=False, help="Wheter to use BF16 mode") - parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu") - parser.add_argument( - "--scheduler", - type=str, - default="preconfig-euler-ancestral", - choices=[ - "pndm", - "lms", - "euler", - "euler-ancestral", - "preconfig-euler-ancestral", - "dpm-multi", - "dpm-single", - "unipc-multi", - "ddim", - "ddpm", - "deis-multi", - "heun", - "kdpm2-ancestral", - "kdpm2", - ], - help="The scheduler type of stable diffusion.", - ) - parser.add_argument( - "--infer_op", - type=str, - default="zero_copy_infer", - choices=[ - "zero_copy_infer", - "raw", - "all", - ], - help="The type of infer op.", - ) - parser.add_argument("--height", type=int, default=512, help="Height of input image") - parser.add_argument("--width", type=int, default=512, help="Width of input image") - parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image") - parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image") - parser.add_argument("--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?") - - return parser.parse_args() - - -def create_ort_runtime(device_id=0): - option = fd.RuntimeOption() - option.use_ort_backend() - if device_id == -1: - option.use_cpu() - else: - option.use_gpu(device_id) - return option - - -def create_paddle_inference_runtime( - use_trt=False, - dynamic_shape=None, - use_fp16=False, - use_bf16=False, - device_id=0, - disable_paddle_trt_ops=[], - disable_paddle_pass=[], - paddle_stream=None, - workspace=None, -): - assert not use_fp16 or not use_bf16, "use_fp16 and use_bf16 are mutually exclusive" - option = fd.RuntimeOption() - option.use_paddle_backend() - if device_id == -1: - option.use_cpu() - else: - option.use_gpu(device_id) - if paddle_stream is not None and use_trt: - option.set_external_raw_stream(paddle_stream) - for pass_name in disable_paddle_pass: - option.paddle_infer_option.delete_pass(pass_name) - if use_bf16: - option.paddle_infer_option.inference_precision = "bfloat16" - if use_trt: - option.paddle_infer_option.disable_trt_ops(disable_paddle_trt_ops) - option.paddle_infer_option.enable_trt = True - if workspace is not None: - option.set_trt_max_workspace_size(workspace) - if use_fp16: - option.trt_option.enable_fp16 = True - else: - # Note(zhoushunjie): These four passes don't support fp32 now. - # Remove this line of code in future. 
- only_fp16_passes = [ - "trt_cross_multihead_matmul_fuse_pass", - "trt_flash_multihead_matmul_fuse_pass", - "preln_elementwise_groupnorm_act_pass", - "elementwise_groupnorm_act_pass", - ] - for curr_pass in only_fp16_passes: - option.paddle_infer_option.delete_pass(curr_pass) - - # Need to enable collect shape - if dynamic_shape is not None: - option.paddle_infer_option.collect_trt_shape = True - for key, shape_dict in dynamic_shape.items(): - option.trt_option.set_shape( - key, shape_dict["min_shape"], shape_dict.get("opt_shape", None), shape_dict.get("max_shape", None) - ) - return option - - -def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): - option = fd.RuntimeOption() - option.use_paddle_lite_backend() - if device == "huawei_ascend_npu": - option.use_ascend() - option.set_lite_device_names(["huawei_ascend_npu"]) - option.set_lite_context_properties( - "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={};HUAWEI_ASCEND_NPU_PRECISION_MODE=allow_mix_precision".format( - device_id - ) - ) - elif device == "kunlunxin_xpu": - # TODO(shentanyue): Add kunlunxin_xpu code - # https://github.com/PaddlePaddle/FastDeploy/blob/4c3e7030e151528d304619901c794481bb2f6037/examples/multimodal/stable_diffusion/infer.py#L178-L195 - option.use_kunlunxin( - device_id, - l3_workspace_size=(64 * 1024 * 1024 - 4 * 1024), - locked=False, - autotune=False, - autotune_file="", - precision="int16", - adaptive_seqlen=True, - enable_multi_stream=True, - ) - if use_fp16: - option.enable_lite_fp16() - else: - pass - return option - - -def create_trt_runtime(workspace=(1 << 31), dynamic_shape=None, use_fp16=False, device_id=0): - option = fd.RuntimeOption() - option.use_trt_backend() - option.use_gpu(device_id) - if use_fp16: - option.enable_trt_fp16() - if workspace is not None: - option.set_trt_max_workspace_size(workspace) - if dynamic_shape is not None: - for key, shape_dict in dynamic_shape.items(): - option.set_trt_input_shape( - key, - min_shape=shape_dict["min_shape"], - opt_shape=shape_dict.get("opt_shape", None), - max_shape=shape_dict.get("max_shape", None), - ) - return option - - -def main(args): - if args.device_id == -1: - paddle.set_device("cpu") - paddle_stream = None - else: - paddle.set_device(f"gpu:{args.device_id}") - paddle_stream = paddle.device.cuda.current_stream(args.device_id).cuda_stream - - seed = 1024 - vae_in_channels = 4 - min_image_size = 512 - max_image_size = 768 - max_image_size = max(min_image_size, max_image_size) - unet_in_channels = 4 - bs = 2 - - image_encoder_dynamic_shape = { - "pixel_values": { - "min_shape": [1, 3, 224, 224], - "max_shape": [1, 3, 224, 224], - "opt_shape": [1, 3, 224, 224], - } - } - - vae_encoder_dynamic_shape = { - "sample": { - "min_shape": [1, 3, min_image_size, min_image_size], - "max_shape": [1, 3, max_image_size, max_image_size], - "opt_shape": [1, 3, min_image_size, min_image_size], - } - } - - vae_decoder_dynamic_shape = { - "latent_sample": { - "min_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], - "max_shape": [1, vae_in_channels, max_image_size // 8, max_image_size // 8], - "opt_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], - } - } - - unet_dynamic_shape = { - "sample": { - "min_shape": [1, unet_in_channels, min_image_size // 8, min_image_size // 8], - "max_shape": [bs, unet_in_channels, max_image_size // 8, max_image_size // 8], - "opt_shape": [2, unet_in_channels, min_image_size // 8, min_image_size // 8], - }, - "timestep": { - "min_shape": [1], - "max_shape": [1], - "opt_shape": 
[1], - }, - "encoder_hidden_states": { - "min_shape": [1, 1, 768], - "max_shape": [bs, 1, 768], - "opt_shape": [2, 1, 768], - }, - } - # 4. Init runtime - if args.backend == "onnx_runtime": - runtime_options = dict( - text_encoder=create_ort_runtime(device_id=args.device_id), - vae_encoder=create_ort_runtime(device_id=args.device_id), - vae_decoder=create_ort_runtime(device_id=args.device_id), - unet=create_ort_runtime(device_id=args.device_id), - ) - elif args.backend == "paddlelite": - runtime_options = dict( - text_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), - vae_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), - vae_decoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), - unet=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=args.use_fp16), - ) - elif args.backend == "tensorrt": - runtime_options = dict( - image_encoder=create_trt_runtime( - dynamic_shape=image_encoder_dynamic_shape, use_fp16=args.use_fp16, device_id=args.device_id - ), - vae_encoder=create_trt_runtime( - dynamic_shape=vae_encoder_dynamic_shape, use_fp16=args.use_fp16, device_id=args.device_id - ), - vae_decoder=create_trt_runtime( - dynamic_shape=vae_decoder_dynamic_shape, use_fp16=args.use_fp16, device_id=args.device_id - ), - unet=create_trt_runtime( - dynamic_shape=unet_dynamic_shape, use_fp16=args.use_fp16, device_id=args.device_id - ), - ) - elif args.backend == "paddle" or args.backend == "paddle_tensorrt": - args.use_trt = args.backend == "paddle_tensorrt" - runtime_options = dict( - image_encoder=create_paddle_inference_runtime( - use_trt=args.use_trt, - dynamic_shape=image_encoder_dynamic_shape, - use_fp16=args.use_fp16, - use_bf16=args.use_bf16, - device_id=args.device_id, - disable_paddle_trt_ops=["arg_max", "range", "lookup_table_v2"], - paddle_stream=paddle_stream, - ), - vae_encoder=create_paddle_inference_runtime( - use_trt=args.use_trt, - dynamic_shape=vae_encoder_dynamic_shape, - use_fp16=args.use_fp16, - use_bf16=args.use_bf16, - device_id=args.device_id, - paddle_stream=paddle_stream, - ), - vae_decoder=create_paddle_inference_runtime( - use_trt=args.use_trt, - dynamic_shape=vae_decoder_dynamic_shape, - use_fp16=args.use_fp16, - use_bf16=args.use_bf16, - device_id=args.device_id, - paddle_stream=paddle_stream, - ), - unet=create_paddle_inference_runtime( - use_trt=args.use_trt, - dynamic_shape=unet_dynamic_shape, - use_fp16=args.use_fp16, - use_bf16=args.use_bf16, - device_id=args.device_id, - paddle_stream=paddle_stream, - ), - ) - pipe = FastDeployStableDiffusionImageVariationPipeline.from_pretrained( - args.model_dir, - runtime_options=runtime_options, - ) - pipe.set_progress_bar_config(disable=True) - pipe.change_scheduler(args.scheduler) - # parse_prompt_type = args.parse_prompt_type - width = args.width - height = args.height - # hr_resize_width = args.hr_resize_width - # hr_resize_height = args.hr_resize_height - - if args.infer_op == "all": - infer_op_list = ["zero_copy_infer", "raw"] - else: - infer_op_list = [args.infer_op] - if args.device == "kunlunxin_xpu" or args.backend == "paddle": - print("When device is kunlunxin_xpu or backend is paddle, we will use `raw` infer op.") - infer_op_list = ["raw"] - - for infer_op in infer_op_list: - infer_op_dict = { - "vae_encoder": infer_op, - "vae_decoder": infer_op, - "image_encoder": infer_op, - "unet": infer_op, - } - folder = f"infer_op_{infer_op}_fp16" if 
args.use_fp16 else f"infer_op_{infer_op}_fp32" - os.makedirs(folder, exist_ok=True) - - # image_variation - img_url = ( - "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" - ) - init_image = load_image(img_url) - time_costs = [] - # warmup - pipe( - image=init_image, - num_inference_steps=20, - height=height, - width=width, - infer_op_dict=infer_op_dict, - ) - print("==> Test image_variation performance.") - for step in trange(args.benchmark_steps): - start = time.time() - paddle.seed(seed) - images = pipe( - image=init_image, - num_inference_steps=args.inference_steps, - height=height, - width=width, - infer_op_dict=infer_op_dict, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." - ) - images[0].save(f"{folder}/image_variation.png") - - -if __name__ == "__main__": - args = parse_arguments() - main(args) diff --git a/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph.py b/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph.py deleted file mode 100644 index 8058fe78f630..000000000000 --- a/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import time -import warnings - -import numpy as np -import paddle -from tqdm.auto import trange - -from paddlenlp.trainer.argparser import strtobool -from paddlenlp.utils.log import logger -from ppdiffusers import StableDiffusionImageVariationPipeline -from ppdiffusers.utils import load_image - -logger.set_level("WARNING") - - -def parse_arguments(): - - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_dir", - default="runwayml/stable-diffusion-v1-5", - help="The model directory of diffusion_model.", - ) - parser.add_argument("--inference_steps", type=int, default=50, help="The number of unet inference steps.") - parser.add_argument("--benchmark_steps", type=int, default=1, help="The number of performance benchmark steps.") - parser.add_argument( - "--parse_prompt_type", - type=str, - default="lpw", - choices=[ - "raw", - "lpw", - ], - help="The parse_prompt_type can be one of [raw, lpw]. ", - ) - parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") - parser.add_argument( - "--attention_type", type=str, default="raw", choices=["raw", "cutlass", "flash", "all"], help="attention_type." - ) - parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. 
-1 means use cpu") - parser.add_argument("--height", type=int, default=512, help="Height of input image") - parser.add_argument("--width", type=int, default=512, help="Width of input image") - parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image") - parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image") - return parser.parse_args() - - -def main(args): - if args.device_id == -1: - paddle.set_device("cpu") - else: - paddle.set_device(f"gpu:{args.device_id}") - - seed = 1024 - # paddle_dtype = paddle.float16 if args.use_fp16 else paddle.float32 - pipe = StableDiffusionImageVariationPipeline.from_pretrained( - args.model_dir, - safety_checker=None, - requires_safety_checker=False, - ) - pipe.set_progress_bar_config(disable=True) - # parse_prompt_type = args.parse_prompt_type - if args.attention_type == "all": - args.attention_type = ["raw", "cutlass", "flash"] - else: - args.attention_type = [args.attention_type] - - for attention_type in args.attention_type: - if attention_type == "raw": - pipe.disable_xformers_memory_efficient_attention() - else: - try: - pipe.enable_xformers_memory_efficient_attention(attention_type) - except Exception as e: - if attention_type == "flash": - warnings.warn( - "Attention type flash is not supported on your GPU! We need to use 3060、3070、3080、3090、4060、4070、4080、4090、A30、A100 etc." - ) - continue - else: - raise ValueError(e) - - width = args.width - height = args.height - # hr_resize_width = args.hr_resize_width - # hr_resize_height = args.hr_resize_height - folder = f"attn_{attention_type}_fp16" if args.use_fp16 else f"attn_{attention_type}_fp32" - os.makedirs(folder, exist_ok=True) - - # image_variation - img_url = ( - "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" - ) - init_image = load_image(img_url) - time_costs = [] - # warmup - pipe( - image=init_image, - num_inference_steps=20, - height=height, - width=width, - ) - print("==> Test image_variation performance.") - for step in trange(args.benchmark_steps): - start = time.time() - paddle.seed(seed) - images = pipe( - image=init_image, - num_inference_steps=args.inference_steps, - height=height, - width=width, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." - ) - images[0].save(f"{folder}/image_variation.png") - - -if __name__ == "__main__": - args = parse_arguments() - main(args) diff --git a/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph_torch.py b/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph_torch.py deleted file mode 100644 index ab48e8cb4ff7..000000000000 --- a/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph_torch.py +++ /dev/null @@ -1,283 +0,0 @@ -# Copyright (c) 2023 torchtorch Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import time - -import torch - -torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention -delattr(torch.nn.functional, "scaled_dot_product_attention") -import numpy as np -from diffusers import ( - DDIMScheduler, - DDPMScheduler, - DEISMultistepScheduler, - DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - HeunDiscreteScheduler, - KDPM2AncestralDiscreteScheduler, - KDPM2DiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionImageVariationPipeline, - UniPCMultistepScheduler, -) -from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0 -from diffusers.utils import load_image -from tqdm.auto import trange - - -def strtobool(v): - if isinstance(v, bool): - return v - if v.lower() in ("yes", "true", "t", "y", "1"): - return True - elif v.lower() in ("no", "false", "f", "n", "0"): - return False - else: - raise ValueError( - f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)." - ) - - -def change_scheduler(self, scheduler_type="ddim"): - self.orginal_scheduler_config = self.scheduler.config - scheduler_type = scheduler_type.lower() - if scheduler_type == "pndm": - scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) - elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "dpm-multi": - scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "dpm-single": - scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "kdpm2-ancestral": - scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "kdpm2": - scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "unipc-multi": - scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "ddim": - scheduler = DDIMScheduler.from_config( - self.orginal_scheduler_config, - steps_offset=1, - clip_sample=False, - set_alpha_to_one=False, - ) - elif scheduler_type == "ddpm": - scheduler = DDPMScheduler.from_config( - self.orginal_scheduler_config, - ) - elif scheduler_type == "deis-multi": - scheduler = DEISMultistepScheduler.from_config( - self.orginal_scheduler_config, - ) - else: - raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - return scheduler - - -def parse_arguments(): - - parser = argparse.ArgumentParser() - parser.add_argument( - "--pretrained_model_name_or_path", 
- default="runwayml/stable-diffusion-v1-5", - help="The model directory of diffusion_model.", - ) - parser.add_argument("--inference_steps", type=int, default=50, help="The number of unet inference steps.") - parser.add_argument("--benchmark_steps", type=int, default=10, help="The number of performance benchmark steps.") - parser.add_argument( - "--parse_prompt_type", - type=str, - default="raw", - choices=[ - "raw", - "lpw", - ], - help="The parse_prompt_type can be one of [raw, lpw]. ", - ) - parser.add_argument("--channels_last", type=strtobool, default=False, help="Wheter to use channels_last") - parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") - parser.add_argument("--tf32", type=strtobool, default=True, help="tf32") - parser.add_argument("--compile", type=strtobool, default=False, help="compile") - parser.add_argument( - "--attention_type", - type=str, - default="sdp", - choices=[ - "raw", - "sdp", - ], - help="attention_type.", - ) - parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu") - parser.add_argument( - "--scheduler", - type=str, - default="euler-ancestral", - choices=[ - "pndm", - "lms", - "euler", - "euler-ancestral", - "dpm-multi", - "dpm-single", - "unipc-multi", - "ddim", - "ddpm", - "deis-multi", - "heun", - "kdpm2-ancestral", - "kdpm2", - ], - help="The scheduler type of stable diffusion.", - ) - parser.add_argument("--height", type=int, default=512, help="Height of input image") - parser.add_argument("--width", type=int, default=512, help="Width of input image") - return parser.parse_args() - - -def attn_processors(self): - processors = {} - - def fn_recursive_add_processors(name: str, module, processors): - if hasattr(module, "set_processor"): - processors[f"{name}.processor"] = module.processor - - for sub_name, child in module.named_children(): - fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) - - return processors - - for name, module in self.named_children(): - fn_recursive_add_processors(name, module, processors) - - return processors - - -def set_attn_processor(self, processor): - count = len(attn_processors(self).keys()) - - if isinstance(processor, dict) and len(processor) != count: - raise ValueError( - f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" - f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
- ) - - def fn_recursive_attn_processor(name: str, module, processor): - if hasattr(module, "set_processor"): - if not isinstance(processor, dict): - module.set_processor(processor) - else: - module.set_processor(processor.pop(f"{name}.processor")) - - for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) - - for name, module in self.named_children(): - fn_recursive_attn_processor(name, module, processor) - - -def main(args): - if args.tf32: - torch.backends.cuda.matmul.allow_tf32 = True - else: - torch.backends.cuda.matmul.allow_tf32 = False - - seed = 1024 - torch_dtype = torch.float16 if args.use_fp16 else torch.float32 - pipe = StableDiffusionImageVariationPipeline.from_pretrained( - args.pretrained_model_name_or_path, - safety_checker=None, - requires_safety_checker=False, - torch_dtype=torch_dtype, - ) - scheduler = change_scheduler(pipe, args.scheduler) - pipe.scheduler = scheduler - if args.device_id >= 0: - pipe.to(f"cuda:{args.device_id}") - - if args.attention_type == "all": - args.attention_type = ["raw", "sdp"] - else: - args.attention_type = [args.attention_type] - - for attention_type in args.attention_type: - attn_prrocessor_cls = AttnProcessor if attention_type == "raw" else AttnProcessor2_0 - if attention_type == "sdp": - torch.nn.functional.scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention_ - - set_attn_processor(pipe.unet, attn_prrocessor_cls()) - set_attn_processor(pipe.vae, attn_prrocessor_cls()) - if args.channels_last: - pipe.unet.to(memory_format=torch.channels_last) - - if args.compile: - print("Run torch compile") - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - - width = args.width - height = args.height - pipe.set_progress_bar_config(disable=True) - - folder = f"torch_attn_{attention_type}_fp16" if args.use_fp16 else f"torch_attn_{attention_type}_fp32" - os.makedirs(folder, exist_ok=True) - - # image_vairation - img_url = ( - "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" - ) - init_image = load_image(img_url).resize((width, height)) - time_costs = [] - # warmup - pipe( - image=init_image, - num_inference_steps=20, - height=height, - width=width, - ) - print("==> Test image_vairation performance.") - for step in trange(args.benchmark_steps): - start = time.time() - torch.cuda.manual_seed(seed) - images = pipe( - image=init_image, - num_inference_steps=args.inference_steps, - height=height, - width=width, - ).images - latency = time.time() - start - time_costs += [latency] - # print(f"No {step:3d} time cost: {latency:2f} s") - print( - f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " - f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." 
- )
- images[0].save(f"{folder}/image_variation.png")
-
-
-if __name__ == "__main__":
- args = parse_arguments()
- main(args)
diff --git a/ppdiffusers/examples/Stable-CycleDiffusion/README.md b/ppdiffusers/examples/Stable-CycleDiffusion/README.md
deleted file mode 100644
index 162f5eeb25bd..000000000000
--- a/ppdiffusers/examples/Stable-CycleDiffusion/README.md
+++ /dev/null
@@ -1,26 +0,0 @@
-# Stable-CycleDiffusion
-
-## Dependencies
-```shell
-pip install -r requirements.txt
-```
-
-## Prepare the example images
-```shell
-wget https://paddlenlp.bj.bcebos.com/models/community/ChenWu98/Stable-CycleDiffusion/images.tar.gz
-tar -zxvf images.tar.gz
-```
-
-## Usage
-### Gradio
-```shell
-python app.py
-```
-### UI preview
-
-<!-- Gradio UI screenshots (original image tags omitted) -->
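Besides the Gradio app, the pipeline can be driven directly from Python. The sketch below is assembled from the calls in `app.py` and one of its built-in Examples rows (black car → blue car); the input image comes from `images.tar.gz` above, and the output filename is arbitrary.

```python
import paddle
from ppdiffusers import CycleDiffusionPipeline, DDIMScheduler
from ppdiffusers.utils import load_image

# Model id and sampler follow app.py.
pipe = CycleDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)

# One of the example images shipped in images.tar.gz.
image = load_image("images/black_car.png")
paddle.seed(0)
result = pipe(
    prompt="A blue colored car.",          # target prompt
    source_prompt="A black colored car.",  # source prompt describing the input image
    image=image,
    num_inference_steps=100,
    strength=0.85,
    guidance_scale=3.0,
    source_guidance_scale=1.0,
    eta=0.1,
)
result.images[0].save("cycle_diffusion_output.png")
```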
- -## 参考 -- https://huggingface.co/spaces/ChenWu98/Stable-CycleDiffusion -- https://arxiv.org/abs/2210.05559 diff --git a/ppdiffusers/examples/Stable-CycleDiffusion/app.py b/ppdiffusers/examples/Stable-CycleDiffusion/app.py deleted file mode 100644 index 3fb59a65b92e..000000000000 --- a/ppdiffusers/examples/Stable-CycleDiffusion/app.py +++ /dev/null @@ -1,676 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import abc -import os -from typing import Dict, List, Optional, Tuple, Union - -import gradio as gr -import paddle -import paddle.nn.functional as F -import ptp_utils -import seq_aligner -from PIL import Image - -from ppdiffusers import CycleDiffusionPipeline, DDIMScheduler - -LOW_RESOURCE = False -MAX_NUM_WORDS = 77 - -paddle_dtype = paddle.float32 # paddle.float32 -model_id_or_path = "CompVis/stable-diffusion-v1-4" -device_print = "GPU 🔥" -device = "gpu" - -pipe = CycleDiffusionPipeline.from_pretrained( - model_id_or_path, use_auth_token=os.environ.get("USER_TOKEN"), paddle_dtype=paddle_dtype -) -pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) -tokenizer = pipe.tokenizer - - -class LocalBlend: - def __call__(self, x_t, attention_store): - k = 1 - maps = attention_store["down_cross"][2:4] + attention_store["up_cross"][:3] - maps = [item.reshape([self.alpha_layers.shape[0], -1, 1, 16, 16, MAX_NUM_WORDS]) for item in maps] - maps = paddle.concat(maps, axis=1) - maps = (maps * self.alpha_layers).sum(-1).mean(1) - mask = F.max_pool2d(maps, (k * 2 + 1, k * 2 + 1), (1, 1), padding=(k, k)) - mask = F.interpolate(mask, size=(x_t.shape[2:])) - mask = mask / mask.max(2, keepdim=True)[0].max(3, keepdim=True)[0] - mask = mask > self.threshold - mask = (mask[:1] + mask[1:]).cast(x_t.dtype) - x_t = x_t[:1] + mask * (x_t - x_t[:1]) - return x_t - - def __init__(self, prompts: List[str], words, threshold=0.3): - alpha_layers = paddle.zeros([len(prompts), 1, 1, 1, 1, MAX_NUM_WORDS]) - for i, (prompt, words_) in enumerate(zip(prompts, words)): - if type(words_) is str: - words_ = [words_] - for word in words_: - ind = ptp_utils.get_word_inds(prompt, word, tokenizer) - alpha_layers[i, :, :, :, :, ind] = 1 - self.alpha_layers = alpha_layers.cast(paddle_dtype) - self.threshold = threshold - - -class AttentionControl(abc.ABC): - def step_callback(self, x_t): - return x_t - - def between_steps(self): - return - - @property - def num_uncond_att_layers(self): - return self.num_att_layers if LOW_RESOURCE else 0 - - @abc.abstractmethod - def forward(self, attn, is_cross: bool, place_in_unet: str): - raise NotImplementedError - - def __call__(self, attn, is_cross: bool, place_in_unet: str): - if self.cur_att_layer >= self.num_uncond_att_layers: - if LOW_RESOURCE: - attn = self.forward(attn, is_cross, place_in_unet) - else: - # h = attn.shape[0] - # attn[h // 2:] = self.forward(attn[h // 2:], is_cross, place_in_unet) - attn[1:] = self.forward(attn[1:], is_cross, 
place_in_unet) - - self.cur_att_layer += 1 - if self.cur_att_layer == self.num_att_layers + self.num_uncond_att_layers: - self.cur_att_layer = 0 - self.cur_step += 1 - self.between_steps() - return attn - - def reset(self): - self.cur_step = 0 - self.cur_att_layer = 0 - - def __init__(self): - self.cur_step = 0 - self.num_att_layers = -1 - self.cur_att_layer = 0 - - -class EmptyControl(AttentionControl): - def forward(self, attn, is_cross: bool, place_in_unet: str): - return attn - - -class AttentionStore(AttentionControl): - @staticmethod - def get_empty_store(): - return {"down_cross": [], "mid_cross": [], "up_cross": [], "down_self": [], "mid_self": [], "up_self": []} - - def forward(self, attn, is_cross: bool, place_in_unet: str): - key = f"{place_in_unet}_{'cross' if is_cross else 'self'}" - if attn.shape[1] <= 32**2: # avoid memory overhead - self.step_store[key].append(attn) - return attn - - def between_steps(self): - if len(self.attention_store) == 0: - self.attention_store = self.step_store - else: - for key in self.attention_store: - for i in range(len(self.attention_store[key])): - self.attention_store[key][i] += self.step_store[key][i] - self.step_store = self.get_empty_store() - - def get_average_attention(self): - average_attention = { - key: [item / self.cur_step for item in self.attention_store[key]] for key in self.attention_store - } - return average_attention - - def reset(self): - super(AttentionStore, self).reset() - self.step_store = self.get_empty_store() - self.attention_store = {} - - def __init__(self): - super(AttentionStore, self).__init__() - self.step_store = self.get_empty_store() - self.attention_store = {} - - -class AttentionControlEdit(AttentionStore, abc.ABC): - def step_callback(self, x_t): - if self.local_blend is not None: - x_t = self.local_blend(x_t, self.attention_store) - return x_t - - def replace_self_attention(self, attn_base, att_replace): - if att_replace.shape[2] <= 16**2: - return attn_base.unsqueeze(0).expand([att_replace.shape[0], *attn_base.shape]) - else: - return att_replace - - @abc.abstractmethod - def replace_cross_attention(self, attn_base, att_replace): - raise NotImplementedError - - def forward(self, attn, is_cross: bool, place_in_unet: str): - super(AttentionControlEdit, self).forward(attn, is_cross, place_in_unet) - if is_cross or (self.num_self_replace[0] <= self.cur_step < self.num_self_replace[1]): - attn_base, attn_repalce = attn[0], attn[1:] - if is_cross: - alpha_words = self.cross_replace_alpha[self.cur_step] - attn_replace_new = ( - self.replace_cross_attention(attn_base, attn_repalce) * alpha_words - + (1 - alpha_words) * attn_repalce - ) - attn[1:] = attn_replace_new - else: - attn[1:] = self.replace_self_attention(attn_base, attn_repalce) - return attn - - def __init__( - self, - prompts, - num_steps: int, - cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[float, float]]], - self_replace_steps: Union[float, Tuple[float, float]], - local_blend: Optional[LocalBlend], - ): - super(AttentionControlEdit, self).__init__() - self.batch_size = len(prompts) - self.cross_replace_alpha = ptp_utils.get_time_words_attention_alpha( - prompts, num_steps, cross_replace_steps, tokenizer - ).cast(paddle_dtype) - if type(self_replace_steps) is float or type(self_replace_steps) is int: - self_replace_steps = 0, self_replace_steps - self.num_self_replace = int(num_steps * self_replace_steps[0]), int(num_steps * self_replace_steps[1]) - self.local_blend = local_blend - - -class 
AttentionReplace(AttentionControlEdit): - def replace_cross_attention(self, attn_base, att_replace): - return paddle.einsum("hpw,bwn->bhpn", attn_base, self.mapper) - - def __init__( - self, - prompts, - num_steps: int, - cross_replace_steps: float, - self_replace_steps: float, - local_blend: Optional[LocalBlend] = None, - ): - super(AttentionReplace, self).__init__( - prompts, num_steps, cross_replace_steps, self_replace_steps, local_blend - ) - self.mapper = seq_aligner.get_replacement_mapper(prompts, tokenizer).cast(paddle_dtype) - - -class AttentionRefine(AttentionControlEdit): - def replace_cross_attention(self, attn_base, att_replace): - # a.shape [8, 4096, 77] - # b.shape [1, 77] - # pt: a[:, :, b].shape = torch.Size([8, 4096, 1, 77]) - # pd: a.take_along_axis(b.unsqueeze(0), axis=-1).unsqueeze(-2) - - attn_base_replace = ( - attn_base.take_along_axis(self.mapper.unsqueeze(0), axis=-1).unsqueeze(-2).transpose([2, 0, 1, 3]) - ) - attn_replace = attn_base_replace * self.alphas + att_replace * (1 - self.alphas) - return attn_replace - - def __init__( - self, - prompts, - num_steps: int, - cross_replace_steps: float, - self_replace_steps: float, - local_blend: Optional[LocalBlend] = None, - ): - super(AttentionRefine, self).__init__(prompts, num_steps, cross_replace_steps, self_replace_steps, local_blend) - self.mapper, alphas = seq_aligner.get_refinement_mapper(prompts, tokenizer) - alphas = alphas.cast(paddle_dtype) - self.alphas = alphas.reshape([alphas.shape[0], 1, 1, alphas.shape[1]]) - - -def get_equalizer(text: str, word_select: Union[int, Tuple[int, ...]], values: Union[List[float], Tuple[float, ...]]): - if type(word_select) is int or type(word_select) is str: - word_select = (word_select,) - equalizer = paddle.ones([len(values), 77]) - values = paddle.to_tensor(values, dtype=paddle_dtype) - for word in word_select: - inds = ptp_utils.get_word_inds(text, word, tokenizer) - equalizer[:, inds] = values - return equalizer - - -def inference( - source_prompt, - target_prompt, - source_guidance_scale=1, - guidance_scale=5, - num_inference_steps=100, - width=512, - height=512, - seed=0, - img=None, - strength=0.7, - cross_attention_control="None", - cross_replace_steps=0.8, - self_replace_steps=0.4, -): - - paddle.seed(seed) - - ratio = min(height / img.height, width / img.width) - img = img.resize((int(img.width * ratio), int(img.height * ratio))) - # make sure dtype is float - source_guidance_scale = float(source_guidance_scale) - guidance_scale = float(guidance_scale) - strength = float(strength) - self_replace_steps = float(self_replace_steps) - cross_replace_steps = float(cross_replace_steps) - - # create the CAC controller. 
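- # "Replace" swaps the source prompt's cross-attention maps into the target branch,
- # "Refine" blends source and target attention word by word, and "None" runs plain
- # CycleDiffusion without cross attention control.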
- if cross_attention_control == "Replace": - controller = AttentionReplace( - [source_prompt, target_prompt], - num_inference_steps, - cross_replace_steps=cross_replace_steps, - self_replace_steps=self_replace_steps, - ) - ptp_utils.register_attention_control(pipe, controller) - elif cross_attention_control == "Refine": - controller = AttentionRefine( - [source_prompt, target_prompt], - num_inference_steps, - cross_replace_steps=cross_replace_steps, - self_replace_steps=self_replace_steps, - ) - ptp_utils.register_attention_control(pipe, controller) - elif cross_attention_control == "None": - controller = EmptyControl() - ptp_utils.register_attention_control(pipe, controller) - else: - raise ValueError("Unknown cross_attention_control: {}".format(cross_attention_control)) - - with paddle.amp.auto_cast(True, level="O2"): - results = pipe( - prompt=target_prompt, - source_prompt=source_prompt, - image=img, - num_inference_steps=num_inference_steps, - eta=0.1, - strength=strength, - guidance_scale=guidance_scale, - source_guidance_scale=source_guidance_scale, - ) - if pipe.safety_checker is None: - return results.images[0] - else: - return replace_nsfw_images(results) - - -def replace_nsfw_images(results): - for i in range(len(results.images)): - if results.nsfw_content_detected[i]: - results.images[i] = Image.open("images/nsfw.png") - return results.images[0] - - -css = """.cycle-diffusion-div div{display:inline-flex;align-items:center;gap:.8rem;font-size:1.75rem}.cycle-diffusion-div div h1{font-weight:900;margin-bottom:7px}.cycle-diffusion-div p{margin-bottom:10px;font-size:94%}.cycle-diffusion-div p a{text-decoration:underline}.tabs{margin-top:0;margin-bottom:0}#gallery{min-height:20rem} -""" -with gr.Blocks(css=css) as demo: - gr.HTML( - """ -
-
-

CycleDiffusion with Stable Diffusion

-
-

- Demo for CycleDiffusion with Stable Diffusion.
- CycleDiffusion (📄 Paper link | 🧨 Pipeline doc) is an image-to-image translation method that supports stochastic samplers for diffusion models.
- We also support the combination of CycleDiffusion and Cross Attention Control (CAC | 📄 Paper link). CAC is a technique that transfers attention maps from the source prompt to the target prompt.
-

-

- Quick start:
- 1. Click one row of Examples at the end of this page. It will fill all inputs needed.
- 2. Click the "Run CycleDiffusion" button.
-

-
- """ - ) - with gr.Accordion("See Details", open=False): - gr.HTML( - """ -
-

- How to use:
- 1. Upload an image.
- 2. Enter the source and target prompts.
- 3. Select the source guidance scale (for "encoding") and the target guidance scale (for "decoding").
- 4. Select the strength (smaller strength means better content preservation).
- 5 (optional). Configure Cross Attention Control options (e.g., CAC type, cross replace steps, self replace steps).
- 6 (optional). Configure other options (e.g., image size, inference steps, random seed).
- 7. Click the "Run CycleDiffusion" button (a programmatic equivalent is sketched below).
-
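The steps above drive the Gradio UI; for reference, here is a minimal programmatic sketch of the same call. It mirrors the `inference` function defined earlier in this file, with the prompt pair and values taken from the first Example row. It assumes `pipe` is the CycleDiffusion pipeline instantiated at the top of app.py (not shown in this patch) and `img` is a PIL input image.

```python
# Sketch only: `pipe` (CycleDiffusion pipeline) and `img` (PIL.Image) are assumed
# to exist as they do in app.py; the keyword arguments follow the call made
# inside the `inference` function above.
results = pipe(
    prompt="An astronaut riding an elephant",      # target prompt
    source_prompt="An astronaut riding a horse",   # source prompt
    image=img,
    num_inference_steps=100,
    eta=0.1,
    strength=0.8,
    guidance_scale=2.0,
    source_guidance_scale=1.0,
)
results.images[0].save("output.png")
```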

-

- Notes:
- 1. CycleDiffusion is likely to fail when drastic changes are intended (e.g., changing a large black car to red).
- 2. The strength value can be set higher when CAC is used.
- 3. If CAC type is "Replace", the source and target prompts should differ in only one token; otherwise, an error will be raised. This is why we deliberately make some grammar mistakes in Examples.
- 4. If CAC type is "Refine", the source prompt must be a subsequence of the target prompt; otherwise, an error will be raised (see the example pairs below).
-
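To make the "Replace" and "Refine" constraints above concrete, here are two illustrative prompt pairs. The first comes from the Examples at the bottom of this page; the second is a hypothetical pair added here for illustration only.

```python
# "Replace": both prompts must tokenize to the same length and differ in exactly
# one token, which is why the Examples deliberately use "a elephant".
replace_pair = ("An astronaut riding a horse", "An astronaut riding a elephant")

# "Refine": the source prompt must be a subsequence of the target prompt.
# (Hypothetical pair, not taken from the Examples.)
refine_pair = ("A photo of a house", "A photo of a house in the snow")
```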

-

- Runtimes:
- 1. 20s on A10G.
-

-
- """ - ) - with gr.Row(): - - with gr.Column(scale=55): - with gr.Group(): - - img = gr.Image(label="Input image", height=512, tool="editor", type="pil") - - image_out = gr.Image(label="Output image", height=512) - # gallery = gr.Gallery( - # label="Generated images", show_label=False, elem_id="gallery" - # ).style(grid=[1], height="auto") - - with gr.Column(scale=45): - with gr.Tab("Edit options"): - with gr.Group(): - with gr.Row(): - source_prompt = gr.Textbox( - label="Source prompt", placeholder="Source prompt describes the input image" - ) - source_guidance_scale = gr.Slider( - label="Source guidance scale", value=1, minimum=1, maximum=10 - ) - with gr.Row(): - target_prompt = gr.Textbox( - label="Target prompt", placeholder="Target prompt describes the output image" - ) - guidance_scale = gr.Slider(label="Target guidance scale", value=5, minimum=1, maximum=10) - with gr.Row(): - strength = gr.Slider(label="Strength", value=0.7, minimum=0.5, maximum=1, step=0.01) - with gr.Row(): - generate1 = gr.Button(value="Run CycleDiffusion") - - with gr.Tab("CAC options"): - with gr.Group(): - with gr.Row(): - cross_attention_control = gr.Radio( - label="CAC type", choices=["None", "Replace", "Refine"], value="None" - ) - with gr.Row(): - # If not "None", the following two parameters will be used. - cross_replace_steps = gr.Slider( - label="Cross replace steps", value=0.8, minimum=0.0, maximum=1, step=0.01 - ) - self_replace_steps = gr.Slider( - label="Self replace steps", value=0.4, minimum=0.0, maximum=1, step=0.01 - ) - with gr.Row(): - generate2 = gr.Button(value="Run CycleDiffusion") - - with gr.Tab("Other options"): - with gr.Group(): - with gr.Row(): - num_inference_steps = gr.Slider( - label="Inference steps", value=100, minimum=25, maximum=500, step=1 - ) - width = gr.Slider(label="Width", value=512, minimum=512, maximum=1024, step=8) - height = gr.Slider(label="Height", value=512, minimum=512, maximum=1024, step=8) - - with gr.Row(): - seed = gr.Slider(0, 2147483647, label="Seed", value=0, step=1) - with gr.Row(): - generate3 = gr.Button(value="Run CycleDiffusion") - - inputs = [ - source_prompt, - target_prompt, - source_guidance_scale, - guidance_scale, - num_inference_steps, - width, - height, - seed, - img, - strength, - cross_attention_control, - cross_replace_steps, - self_replace_steps, - ] - generate1.click(inference, inputs=inputs, outputs=image_out) - generate2.click(inference, inputs=inputs, outputs=image_out) - generate3.click(inference, inputs=inputs, outputs=image_out) - - ex = gr.Examples( - [ - [ - "An astronaut riding a horse", - "An astronaut riding an elephant", - 1, - 2, - 100, - 512, - 512, - 0, - "images/astronaut_horse.png", - 0.8, - "None", - 0, - 0, - ], - [ - "An astronaut riding a horse", - "An astronaut riding a elephant", - 1, - 2, - 100, - 512, - 512, - 0, - "images/astronaut_horse.png", - 0.9, - "Replace", - 0.15, - 0.10, - ], - [ - "A black colored car.", - "A blue colored car.", - 1, - 3, - 100, - 512, - 512, - 0, - "images/black_car.png", - 0.85, - "None", - 0, - 0, - ], - [ - "A black colored car.", - "A blue colored car.", - 1, - 5, - 100, - 512, - 512, - 0, - "images/black_car.png", - 0.95, - "Replace", - 0.8, - 0.4, - ], - [ - "A black colored car.", - "A red colored car.", - 1, - 5, - 100, - 512, - 512, - 0, - "images/black_car.png", - 1, - "Replace", - 0.8, - 0.4, - ], - [ - "An aerial view of autumn scene.", - "An aerial view of winter scene.", - 1, - 5, - 100, - 512, - 512, - 0, - "images/mausoleum.png", - 0.9, - "None", - 0, - 0, - ], - [ - 
"An aerial view of autumn scene.", - "An aerial view of winter scene.", - 1, - 5, - 100, - 512, - 512, - 0, - "images/mausoleum.png", - 1, - "Replace", - 0.8, - 0.4, - ], - [ - "A green apple and a black backpack on the floor.", - "A red apple and a black backpack on the floor.", - 1, - 7, - 100, - 512, - 512, - 0, - "images/apple_bag.png", - 0.9, - "None", - 0, - 0, - ], - [ - "A green apple and a black backpack on the floor.", - "A red apple and a black backpack on the floor.", - 1, - 7, - 100, - 512, - 512, - 0, - "images/apple_bag.png", - 0.9, - "Replace", - 0.8, - 0.4, - ], - [ - "A hotel room with red flowers on the bed.", - "A hotel room with a cat sitting on the bed.", - 1, - 4, - 100, - 512, - 512, - 0, - "images/flower_hotel.png", - 0.8, - "None", - 0, - 0, - ], - [ - "A hotel room with red flowers on the bed.", - "A hotel room with blue flowers on the bed.", - 1, - 5, - 100, - 512, - 512, - 0, - "images/flower_hotel.png", - 0.95, - "None", - 0, - 0, - ], - [ - "A green apple and a black backpack on the floor.", - "Two green apples and a black backpack on the floor.", - 1, - 5, - 100, - 512, - 512, - 0, - "images/apple_bag.png", - 0.89, - "None", - 0, - 0, - ], - ], - [ - source_prompt, - target_prompt, - source_guidance_scale, - guidance_scale, - num_inference_steps, - width, - height, - seed, - img, - strength, - cross_attention_control, - cross_replace_steps, - self_replace_steps, - ], - image_out, - inference, - cache_examples=True, - ) - - gr.Markdown( - """ - Space built with PPDiffusers 🧨 by PaddleNLP. - [![Twitter Follow](https://img.shields.io/twitter/follow/ChenHenryWu?style=social)](https://twitter.com/ChenHenryWu) - """ - ) - -demo.launch(debug=True, share=True, server_name="0.0.0.0", server_port=8581) diff --git a/ppdiffusers/examples/Stable-CycleDiffusion/ptp_utils.py b/ppdiffusers/examples/Stable-CycleDiffusion/ptp_utils.py deleted file mode 100644 index 4602f5b059be..000000000000 --- a/ppdiffusers/examples/Stable-CycleDiffusion/ptp_utils.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Dict, Optional, Tuple, Union - -import numpy as np -import paddle - - -def register_attention_control(model, controller): - def ca_forward(self, place_in_unet): - def forward(hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs): - batch_size, sequence_length, _ = hidden_states.shape - attention_mask = self.prepare_attention_mask(attention_mask, sequence_length, batch_size) - - query = self.to_q(hidden_states) - query = self.head_to_batch_dim(query) - - is_cross = encoder_hidden_states is not None - encoder_hidden_states = encoder_hidden_states if is_cross else hidden_states - - key = self.to_k(encoder_hidden_states) - value = self.to_v(encoder_hidden_states) - key = self.head_to_batch_dim(key) - value = self.head_to_batch_dim(value) - - attention_probs = self.get_attention_scores(query, key, attention_mask) - - attention_probs = controller(attention_probs, is_cross, place_in_unet) - - hidden_states = paddle.matmul(attention_probs, value) - hidden_states = self.batch_to_head_dim(hidden_states) - - # linear proj - hidden_states = self.to_out[0](hidden_states) - # dropout - hidden_states = self.to_out[1](hidden_states) - return hidden_states - - return forward - - def register_recr(net_, count, place_in_unet): - if net_.__class__.__name__ == "CrossAttention": - net_.forward = ca_forward(net_, place_in_unet) - return count + 1 - elif hasattr(net_, "children"): - for net__ in net_.children(): - count = register_recr(net__, count, place_in_unet) - return count - - cross_att_count = 0 - sub_nets = model.unet.named_children() - for net in sub_nets: - if "down" in net[0]: - cross_att_count += register_recr(net[1], 0, "down") - elif "up" in net[0]: - cross_att_count += register_recr(net[1], 0, "up") - elif "mid" in net[0]: - cross_att_count += register_recr(net[1], 0, "mid") - controller.num_att_layers = cross_att_count - - -def get_word_inds(text: str, word_place: int, tokenizer): - split_text = text.split(" ") - if type(word_place) is str: - word_place = [i for i, word in enumerate(split_text) if word_place == word] - elif type(word_place) is int: - word_place = [word_place] - out = [] - if len(word_place) > 0: - words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text).input_ids][1:-1] - cur_len, ptr = 0, 0 - - for i in range(len(words_encode)): - cur_len += len(words_encode[i]) - if ptr in word_place: - out.append(i + 1) - if cur_len >= len(split_text[ptr]): - ptr += 1 - cur_len = 0 - return np.array(out) - - -def update_alpha_time_word( - alpha, bounds: Union[float, Tuple[float, float]], prompt_ind: int, word_inds: Optional[paddle.Tensor] = None -): - if type(bounds) is float or bounds == 0: - bounds = 0, bounds - start, end = int(bounds[0] * alpha.shape[0]), int(bounds[1] * alpha.shape[0]) - if word_inds is None: - word_inds = paddle.arange(alpha.shape[2]) - alpha[:start, prompt_ind, word_inds] = 0 - alpha[start:end, prompt_ind, word_inds] = 1 - alpha[end:, prompt_ind, word_inds] = 0 - return alpha - - -def get_time_words_attention_alpha( - prompts, - num_steps, - cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[float, float]]], - tokenizer, - max_num_words=77, -): - if type(cross_replace_steps) is not dict: - cross_replace_steps = {"default_": cross_replace_steps} - if "default_" not in cross_replace_steps: - cross_replace_steps["default_"] = (0.0, 1.0) - alpha_time_words = paddle.zeros([num_steps + 1, len(prompts) - 1, max_num_words]) - for i in range(len(prompts) - 1): - 
alpha_time_words = update_alpha_time_word(alpha_time_words, cross_replace_steps["default_"], i) - for key, item in cross_replace_steps.items(): - if key != "default_": - inds = [get_word_inds(prompts[i], key, tokenizer) for i in range(1, len(prompts))] - for i, ind in enumerate(inds): - if len(ind) > 0: - alpha_time_words = update_alpha_time_word(alpha_time_words, item, i, ind) - alpha_time_words = alpha_time_words.reshape( - [num_steps + 1, len(prompts) - 1, 1, 1, max_num_words] - ) # time, batch, heads, pixels, words - return alpha_time_words diff --git a/ppdiffusers/examples/Stable-CycleDiffusion/requirements.txt b/ppdiffusers/examples/Stable-CycleDiffusion/requirements.txt deleted file mode 100644 index d77a600a0daf..000000000000 --- a/ppdiffusers/examples/Stable-CycleDiffusion/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -paddlenlp>=2.6.0rc0 -Pillow -ppdiffusers>=0.16.1 \ No newline at end of file diff --git a/ppdiffusers/examples/Stable-CycleDiffusion/seq_aligner.py b/ppdiffusers/examples/Stable-CycleDiffusion/seq_aligner.py deleted file mode 100644 index 24c30b91e7f7..000000000000 --- a/ppdiffusers/examples/Stable-CycleDiffusion/seq_aligner.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import numpy as np -import paddle - - -class ScoreParams: - def __init__(self, gap, match, mismatch): - self.gap = gap - self.match = match - self.mismatch = mismatch - - def mis_match_char(self, x, y): - if x != y: - return self.mismatch - else: - return self.match - - -# def get_matrix(size_x, size_y, gap): -# matrix = [] -# for i in range(len(size_x) + 1): -# sub_matrix = [] -# for j in range(len(size_y) + 1): -# sub_matrix.append(0) -# matrix.append(sub_matrix) -# for j in range(1, len(size_y) + 1): -# matrix[0][j] = j * gap -# for i in range(1, len(size_x) + 1): -# matrix[i][0] = i * gap -# return matrix - - -def get_matrix(size_x, size_y, gap): - matrix = np.zeros((size_x + 1, size_y + 1), dtype=np.int32) - matrix[0, 1:] = (np.arange(size_y) + 1) * gap - matrix[1:, 0] = (np.arange(size_x) + 1) * gap - return matrix - - -def get_traceback_matrix(size_x, size_y): - matrix = np.zeros((size_x + 1, size_y + 1), dtype=np.int32) - matrix[0, 1:] = 1 - matrix[1:, 0] = 2 - matrix[0, 0] = 4 - return matrix - - -def global_align(x, y, score): - matrix = get_matrix(len(x), len(y), score.gap) - trace_back = get_traceback_matrix(len(x), len(y)) - for i in range(1, len(x) + 1): - for j in range(1, len(y) + 1): - left = matrix[i, j - 1] + score.gap - up = matrix[i - 1, j] + score.gap - diag = matrix[i - 1, j - 1] + score.mis_match_char(x[i - 1], y[j - 1]) - matrix[i, j] = max(left, up, diag) - if matrix[i, j] == left: - trace_back[i, j] = 1 - elif matrix[i, j] == up: - trace_back[i, j] = 2 - else: - trace_back[i, j] = 3 - return matrix, trace_back - - -def get_aligned_sequences(x, y, trace_back): - x_seq = [] - y_seq = [] - i = len(x) - j = len(y) - mapper_y_to_x = [] - while i > 0 or j > 0: - if trace_back[i, j] == 3: - x_seq.append(x[i - 1]) - y_seq.append(y[j - 1]) - i = i - 1 - j = j - 1 - mapper_y_to_x.append((j, i)) - elif trace_back[i][j] == 1: - x_seq.append("-") - y_seq.append(y[j - 1]) - j = j - 1 - mapper_y_to_x.append((j, -1)) - elif trace_back[i][j] == 2: - x_seq.append(x[i - 1]) - y_seq.append("-") - i = i - 1 - elif trace_back[i][j] == 4: - break - mapper_y_to_x.reverse() - return x_seq, y_seq, paddle.to_tensor(mapper_y_to_x, dtype=paddle.int64) - - -def get_mapper(x: str, y: str, tokenizer, max_len=77): - x_seq = tokenizer.encode(x).input_ids - y_seq = tokenizer.encode(y).input_ids - score = ScoreParams(0, 1, -1) - matrix, trace_back = global_align(x_seq, y_seq, score) - mapper_base = get_aligned_sequences(x_seq, y_seq, trace_back)[-1] - alphas = paddle.ones( - [ - max_len, - ] - ) - alphas[: mapper_base.shape[0]] = (mapper_base[:, 1] != -1).cast("float32") - mapper = paddle.zeros( - [ - max_len, - ], - dtype=paddle.int64, - ) - mapper[: mapper_base.shape[0]] = mapper_base[:, 1] - mapper[mapper_base.shape[0] :] = len(y_seq) + paddle.arange(max_len - len(y_seq), dtype="int64") - return mapper, alphas - - -def get_refinement_mapper(prompts, tokenizer, max_len=77): - x_seq = prompts[0] - mappers, alphas = [], [] - for i in range(1, len(prompts)): - mapper, alpha = get_mapper(x_seq, prompts[i], tokenizer, max_len) - mappers.append(mapper) - alphas.append(alpha) - return paddle.stack(mappers), paddle.stack(alphas) - - -def get_word_inds(text: str, word_place: int, tokenizer): - split_text = text.split(" ") - if type(word_place) is str: - word_place = [i for i, word in enumerate(split_text) if word_place == word] - elif type(word_place) is int: - word_place = [word_place] - out = [] - if len(word_place) > 0: - words_encode = [tokenizer.decode([item]).strip("#") for item in 
tokenizer.encode(text).input_ids][1:-1] - cur_len, ptr = 0, 0 - - for i in range(len(words_encode)): - cur_len += len(words_encode[i]) - if ptr in word_place: - out.append(i + 1) - if cur_len >= len(split_text[ptr]): - ptr += 1 - cur_len = 0 - return np.array(out) - - -def get_replacement_mapper_(x: str, y: str, tokenizer, max_len=77): - words_x = x.split(" ") - words_y = y.split(" ") - if len(words_x) != len(words_y): - raise ValueError( - f"attention replacement edit can only be applied on prompts with the same length" - f" but prompt A has {len(words_x)} words and prompt B has {len(words_y)} words." - ) - inds_replace = [i for i in range(len(words_y)) if words_y[i] != words_x[i]] - inds_source = [get_word_inds(x, i, tokenizer) for i in inds_replace] - inds_target = [get_word_inds(y, i, tokenizer) for i in inds_replace] - mapper = np.zeros((max_len, max_len)) - i = j = 0 - cur_inds = 0 - while i < max_len and j < max_len: - if cur_inds < len(inds_source) and inds_source[cur_inds][0] == i: - inds_source_, inds_target_ = inds_source[cur_inds], inds_target[cur_inds] - if len(inds_source_) == len(inds_target_): - mapper[inds_source_, inds_target_] = 1 - else: - ratio = 1 / len(inds_target_) - for i_t in inds_target_: - mapper[inds_source_, i_t] = ratio - cur_inds += 1 - i += len(inds_source_) - j += len(inds_target_) - elif cur_inds < len(inds_source): - mapper[i, j] = 1 - i += 1 - j += 1 - else: - mapper[j, j] = 1 - i += 1 - j += 1 - - return paddle.to_tensor(mapper).cast("float32") - - -def get_replacement_mapper(prompts, tokenizer, max_len=77): - x_seq = prompts[0] - mappers = [] - for i in range(1, len(prompts)): - mapper = get_replacement_mapper_(x_seq, prompts[i], tokenizer, max_len) - mappers.append(mapper) - return paddle.stack(mappers) diff --git a/ppdiffusers/examples/autoencoder/vae/README.md b/ppdiffusers/examples/autoencoder/vae/README.md deleted file mode 100644 index 8f71827b11ac..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/README.md +++ /dev/null @@ -1,236 +0,0 @@ -## AutoEncoderKL(VAE) 从零训练代码 - -本教程带领大家如何开启[f8encoder_f16decoder](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/examples/autoencoder/vae/config/f8encoder_f16decoder.yaml)架构的AutoEncoderKL (VAE) 模型。 - - -## 1 本地运行 -### 1.1 安装依赖 - -在运行这个训练代码前,我们需要安装下面的训练依赖。 - -```bash -# paddlepaddle-gpu>=2.4.1 -python -m pip install paddlepaddle-gpu==2.4.1.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html -pip install -r requirements.txt -``` - -### 1.2 准备数据 - -#### laion400m_en.filelist文件内部格式如下所示 -自己准备好处理后的数据,并且将文件放置于`/data/laion400m/`目录,其中里面的每个part的前三列为`占位符空, caption文本描述, 占位符空, base64编码的图片`,`_, caption, _, img_b64 = vec[:4]`。 - -注意,当前`laion400m_en.filelist`只存放了10条数据路径,如果想要更多数据的话,请运行`python write_filelist.py`代码,运行后会生成6万条数据路径。 -``` -/data/laion400m/part-00000.gz -/data/laion400m/part-00001.gz -/data/laion400m/part-00002.gz -/data/laion400m/part-00003.gz -/data/laion400m/part-00004.gz -/data/laion400m/part-00005.gz -/data/laion400m/part-00006.gz -/data/laion400m/part-00007.gz -/data/laion400m/part-00008.gz -/data/laion400m/part-00009.gz -``` -#### train.filelist.list训练文件内部格式如下所示 -我们提供了`laion400m_en.filelist`,当然也可以存放其他`filelist` -``` -./data/filelist/laion400m_en.filelist -``` - -### 1.3 Encoder热启,Decoder从零开启训练 -Tips: -- FP32 在 40GB 的显卡上可正常训练,在下面的配置条件下,显存占用约 29G。 - -#### 1.3.1 单机单卡训练 -```bash -python -u train_vae.py \ - --pretrained_model_name_or_path CompVis/stable-diffusion-v1-4 \ - --ignore_keys decoder. 
\ - --vae_config_file config/vae.json \ - --freeze_encoder \ - --input_size 256 256 \ - --max_train_steps 100000000000 \ - --learning_rate 1e-4 \ - --batch_size 4 \ - --num_workers 8 \ - --logging_steps 100 \ - --save_steps 2000 \ - --image_logging_steps 500 \ - --disc_start 50001 \ - --kl_weight 0.000001 \ - --disc_weight 0.5 \ - --resolution 512 -``` - -`train_vae.py`代码可传入的参数解释如下: -> * `--pretrained_model_name_or_path`: 加载预训练模型的名称或本地路径,当我们设置成`CompVis/stable-diffusion-v1-4`后,我们会加载自动加载此模型VAE(kl-8.ckpt)部分的预训练权重。例如:在上面的训练代码中,我们(1)加载了 `kl-8.ckpt` 的 `encoder` 部分权重 (设置 `pretrained_model_name_or_path` 参数),(2)修改了模型 `decoder` 部分的结构 (指定了 `vae_config_file`),(3)删除了不希望加载的预训练权重(设置`ignore_keys`,会自动删除前缀为`ignore_keys`的模型参数)。 -> * `--from_scratch`: 是否所有权重均从零初始化开启训练。 -> * `--scale_lr`: 是否对学习率进行缩放,缩放公式:`ngpus*batch_size*learning_rate`。 -> * `--batch_size`: 训练时每张显卡所使用的`batch_size批量`,当我们的显存较小的时候,需要将这个值设置的小一点。 -> * `--input_size`: `encoder`处接收图片的`height`和`width`,我们在训练不对等层数的`encoder-decoder`结构的`VAE`模型时候才会指定这个参数。 -> * `--learning_rate`: 学习率。 -> * `--max_train_steps`: 最大的训练步数。 -> * `--save_steps`: 每间隔多少步`(global step步数)`,保存模型。 -> * `--image_logging_steps`: 每隔多少步,log训练过程中的图片,默认为`500`步,注意`image_logging_steps`需要是`logging_steps`的整数倍。 -> * `--logging_steps`: logging日志的步数,默认为`100`步,注意,这里log的日志只是单卡、单步的loss信息。 -> * `--output_dir`: 模型保存路径。 -> * `--seed`: 随机种子,为了可以复现训练结果,Tips:当前paddle设置该随机种子后仍无法完美复现。 -> * `--num_workers`: Dataloader所使用的`num_workers`参数。 -> * `--dataset_type`: 数据集类型,当前我们支持`imagenet` 和 `text_image_pair` 两种数据集,默认是`text_image_pair`。 -> * `--file_list`: file_list文件地址,当我们数据集类型是`text_image_pair`时才需要指定。 -> * `--disc_start`: 判别器开启训练的步数。 -> * `--disc_weight`: 判别器loss的权重比例。 -> * `--kl_weight`: kl_loss的权重比例。 -> * `--resolution`: 训练时,图像的分辨率。 -> * `--init_from_ckpt`: 是否加载预训练的ckpt权重,注意:如果我们为了严格同步pytorch的参数初始化,我们可以首先进行转换,然后再设置`init_from_ckpt`这个参数,从而加载预训练权重,如:`scripts/ldm_vae_init0_paddle/model_state.pdparams`。 - - -#### 1.3.2 单机多卡训练 (多机多卡训练,仅需在 paddle.distributed.launch 后加个 --ips IP1,IP2,IP3,IP4) -```bash -python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" train_vae.py \ - --pretrained_model_name_or_path CompVis/stable-diffusion-v1-4 \ - --ignore_keys decoder. \ - --vae_config_file config/vae.json \ - --freeze_encoder \ - --input_size 256 256 \ - --max_train_steps 100000000000 \ - --learning_rate 1e-4 \ - --batch_size 4 \ - --num_workers 8 \ - --logging_steps 100 \ - --save_steps 2000 \ - --image_logging_steps 500 \ - --disc_start 50001 \ - --kl_weight 0.000001 \ - --disc_weight 0.5 \ - --resolution 512 -``` - -### 1.4 Encoder和Decoder从零开启训练 -Tips: -- FP32 在 40GB 的显卡上可正常训练,在下面的配置条件下,显存占用约 29G。 - -#### 1.4.1 单机单卡训练 - -```bash -python -u train_vae.py \ - --from_scratch \ - --vae_config_file config/vae.json \ - --input_size 256 256 \ - --max_train_steps 100000000000 \ - --learning_rate 1e-4 \ - --batch_size 4 \ - --num_workers 8 \ - --logging_steps 100 \ - --save_steps 2000 \ - --image_logging_steps 500 \ - --disc_start 50001 \ - --kl_weight 0.000001 \ - --disc_weight 0.5 \ - --resolution 512 -``` -`train_vae.py`代码可传入的参数解释可参考 **1.3.1** 小节。 - -注意:当我们指定开启`from_scratch`后必须指定`vae_config_file`! 
- - -#### 1.4.2 单机多卡训练 (多机多卡训练,仅需在 paddle.distributed.launch 后加个 --ips IP1,IP2,IP3,IP4) - -```bash -python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" train_vae.py \ - --from_scratch \ - --vae_config_file config/vae.json \ - --input_size 256 256 \ - --max_train_steps 100000000000 \ - --learning_rate 1e-4 \ - --batch_size 4 \ - --num_workers 8 \ - --logging_steps 100 \ - --save_steps 2000 \ - --image_logging_steps 500 \ - --disc_start 50001 \ - --kl_weight 0.000001 \ - --disc_weight 0.5 \ - --resolution 512 -``` - -## 2 模型推理 -```python -import paddle -from IPython.display import display -from ppdiffusers import AutoencoderKL, StableDiffusionImg2ImgPipeline -from ppdiffusers.utils import load_image -from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import preprocess - -def decode_image(image): - image = (image / 2 + 0.5).clip(0, 1).transpose([0, 2, 3, 1]).cast('float32').numpy() - image = StableDiffusionImg2ImgPipeline.numpy_to_pil(image) - return image - -# 我们只需要修改这里的参数配置就可以! -model_name_or_path = "./autoencoder_outputs/checkpoint-200000" -vae = AutoencoderKL.from_pretrained(model_name_or_path) -image = load_image("https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/vermeer.jpg") -sample_32 = preprocess(image.resize((256, 256))) -sample_64 = preprocess(image.resize((512, 512))) - -with paddle.no_grad(): - # sample_32 256 x 256 - dec_32 = vae(sample_32, sample_posterior=True)[0] # must set sample_posterior = True - img_32 = decode_image(dec_32)[0] - display(img_32) - # img_32 512 x 512 - img_32.save('32.jpg') - -with paddle.no_grad(): - # sample_32 512 x 512 - dec_64 = vae(sample_64, sample_posterior=True)[0] # must set sample_posterior = True - img_64 = decode_image(dec_64)[0] - display(img_64) - # img_64 1024 x 1024 - img_64.save('64.jpg') -``` - -
- (two sample reconstruction images from the original README are omitted here)
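A note on the inference example above (editorial addition, not part of the original README): with the f8-encoder / f16-decoder configuration the VAE reconstructs at twice the input resolution, which is why the 256x256 input is saved as a 512x512 image and the 512x512 input as a 1024x1024 image. A minimal sketch of the shape relationship, assuming the `vae`, `image`, and `preprocess` objects from the example above and that ppdiffusers' `AutoencoderKL` mirrors the diffusers encode/decode API:

```python
# Sketch only: reuses `vae`, `image`, and `preprocess` from the example above.
# The encoder downsamples by 8 while the decoder upsamples by 16, so the
# reconstruction is expected to be 2x the input resolution.
import paddle

with paddle.no_grad():
    sample = preprocess(image.resize((256, 256)))       # expected [1, 3, 256, 256]
    latents = vae.encode(sample).latent_dist.sample()   # expected [1, 4, 32, 32]   (256 / 8)
    recon = vae.decode(latents).sample                   # expected [1, 3, 512, 512] (32 * 16)
    print(latents.shape, recon.shape)
```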
- -## 3 其他 -### 3.1 ImageNet 数据集准备 -The code will try to download (through [Academic -Torrents](http://academictorrents.com/)) and prepare ImageNet the first time it -is used. However, since ImageNet is quite large, this requires a lot of disk -space and time. If you already have ImageNet on your disk, you can speed things -up by putting the data into -`${XDG_CACHE}/autoencoders/data/ILSVRC2012_{split}/data/` (which defaults to -`~/.cache/autoencoders/data/ILSVRC2012_{split}/data/`), where `{split}` is one -of `train`/`validation`. It should have the following structure: - -``` -${XDG_CACHE}/autoencoders/data/ILSVRC2012_{split}/data/ -├── n01440764 -│ ├── n01440764_10026.JPEG -│ ├── n01440764_10027.JPEG -│ ├── ... -├── n01443537 -│ ├── n01443537_10007.JPEG -│ ├── n01443537_10014.JPEG -│ ├── ... -├── ... -``` - -If you haven't extracted the data, you can also place -`ILSVRC2012_img_train.tar`/`ILSVRC2012_img_val.tar` (or symlinks to them) into -`${XDG_CACHE}/autoencoders/data/ILSVRC2012_train/` / -`${XDG_CACHE}/autoencoders/data/ILSVRC2012_validation/`, which will then be -extracted into above structure without downloading it again. Note that this -will only happen if neither a folder -`${XDG_CACHE}/autoencoders/data/ILSVRC2012_{split}/data/` nor a file -`${XDG_CACHE}/autoencoders/data/ILSVRC2012_{split}/.ready` exist. Remove them -if you want to force running the dataset preparation again. - -## 4 参考资料 -- https://github.com/CompVis/latent-diffusion -- https://github.com/huggingface/diffusers diff --git a/ppdiffusers/examples/autoencoder/vae/config/f8encoder_f16decoder.yaml b/ppdiffusers/examples/autoencoder/vae/config/f8encoder_f16decoder.yaml deleted file mode 100644 index f08426c278a9..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/config/f8encoder_f16decoder.yaml +++ /dev/null @@ -1,74 +0,0 @@ -model: - base_learning_rate: 1.0e-4 - target: autoencoder.models.autoencoder.AutoencoderKL - params: - ckpt_path: './pretrained_autoencoder/kl-f8.ckpt' - load_decoder_ckpt: False - input_size: [256, 256] - monitor: "val/rec_loss" - embed_dim: 4 - lossconfig: - target: autoencoder.modules.losses.LPIPSWithDiscriminator - params: - disc_start: 50001 - kl_weight: 0.000001 - disc_weight: 0.5 - freeze_encoder: true - ddconfig: - encoder: - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - decoder: - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 2 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 -data: - target: train.DataModuleFromConfig - params: - batch_size: 4 - num_workers: 8 - train: - target: autoencoder.data.text_image_pair.TextImagePair - params: - file_list: data/filelist/train.filelist.list - size: 512 - num_records: 62500 - buffer_size: 100 - -lightning: - callbacks: - image_logger: - target: train.ImageLogger - params: - batch_frequency: 500 - max_images: 8 - increase_log_steps: True - save_every_steps: 2000 - - trainer: - benchmark: True - accumulate_grad_batches: 1 \ No newline at end of file diff --git a/ppdiffusers/examples/autoencoder/vae/config/vae.json b/ppdiffusers/examples/autoencoder/vae/config/vae.json deleted file mode 100644 index 2b68115a36a7..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/config/vae.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "act_fn": "silu", - "block_out_channels": [ - 128, - 256, - 512, - 512 - ], - 
"down_block_out_channels": null, - "down_block_types": [ - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D" - ], - "in_channels": 3, - "layers_per_block": 2, - "norm_num_groups": 32, - "out_channels": 3, - "sample_size": 512, - "up_block_out_channels": [ - 128, - 256, - 256, - 256, - 512 - ], - "latent_channels": 4, - "up_block_types": [ - "UpDecoderBlock2D", - "UpDecoderBlock2D", - "UpDecoderBlock2D", - "UpDecoderBlock2D", - "UpDecoderBlock2D" - ] -} diff --git a/ppdiffusers/examples/autoencoder/vae/data/filelist/laion400m_en.filelist b/ppdiffusers/examples/autoencoder/vae/data/filelist/laion400m_en.filelist deleted file mode 100644 index a70eccdedbad..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/data/filelist/laion400m_en.filelist +++ /dev/null @@ -1,10 +0,0 @@ -/data/laion400m/part-00000.gz -/data/laion400m/part-00001.gz -/data/laion400m/part-00002.gz -/data/laion400m/part-00003.gz -/data/laion400m/part-00004.gz -/data/laion400m/part-00005.gz -/data/laion400m/part-00006.gz -/data/laion400m/part-00007.gz -/data/laion400m/part-00008.gz -/data/laion400m/part-00009.gz \ No newline at end of file diff --git a/ppdiffusers/examples/autoencoder/vae/data/filelist/train.filelist.list b/ppdiffusers/examples/autoencoder/vae/data/filelist/train.filelist.list deleted file mode 100644 index 4bc020729904..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/data/filelist/train.filelist.list +++ /dev/null @@ -1 +0,0 @@ -./data/filelist/laion400m_en.filelist diff --git a/ppdiffusers/examples/autoencoder/vae/data/filelist/write_filelist.py b/ppdiffusers/examples/autoencoder/vae/data/filelist/write_filelist.py deleted file mode 100644 index 358bca25f4fd..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/data/filelist/write_filelist.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -data = [] -for index in range(60000): - data.append("/data/laion400m/part-{:05}.gz\n".format(index)) - -with open("laion400m_en.filelist", "w") as w: - w.writelines(data) diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/__init__.py b/ppdiffusers/examples/autoencoder/vae/ldm/__init__.py deleted file mode 100644 index b23bc1717fbf..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/ldm/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from .autoencoder_datasets import ImageNetSRTrain, ImageNetSRValidation -from .losses import LPIPS, LPIPSWithDiscriminator -from .model import AutoencoderKLWithLoss -from .text_image_pair import TextImagePair, worker_init_fn diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/autoencoder_datasets.py b/ppdiffusers/examples/autoencoder/vae/ldm/autoencoder_datasets.py deleted file mode 100644 index e08645300271..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/ldm/autoencoder_datasets.py +++ /dev/null @@ -1,603 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import glob -import os -import pickle -import shutil -import tarfile -from functools import partial -from pathlib import Path - -import albumentations -import cv2 -import numpy as np -import requests -import yaml -from paddle.io import DataLoader, Dataset, Subset -from paddle.vision.transforms import functional as TF -from PIL import Image -from tqdm.auto import tqdm - -from .image_degradation import degradation_fn_bsr, degradation_fn_bsr_light - - -class ImagePaths(Dataset): - def __init__(self, paths, size=None, random_crop=False, labels=None): - self.size = size - self.random_crop = random_crop - - self.labels = dict() if labels is None else labels - self.labels["file_path_"] = paths - self._length = len(paths) - - if self.size is not None and self.size > 0: - self.rescaler = albumentations.SmallestMaxSize(max_size=self.size) - if not self.random_crop: - self.cropper = albumentations.CenterCrop(height=self.size, width=self.size) - else: - self.cropper = albumentations.RandomCrop(height=self.size, width=self.size) - self.preprocessor = albumentations.Compose([self.rescaler, self.cropper]) - else: - self.preprocessor = lambda **kwargs: kwargs - - def __len__(self): - return self._length - - def preprocess_image(self, image_path): - image = Image.open(image_path) - if not image.mode == "RGB": - image = image.convert("RGB") - image = np.array(image).astype(np.uint8) - image = self.preprocessor(image=image)["image"] - image = (image / 127.5 - 1.0).astype(np.float32) - return image - - def __getitem__(self, i): - example = dict() - example["image"] = self.preprocess_image(self.labels["file_path_"][i]) - for k in self.labels: - example[k] = self.labels[k][i] - return example - - -def download(url, local_path, chunk_size=1024): - os.makedirs(os.path.split(local_path)[0], exist_ok=True) - with requests.get(url, stream=True) as r: - total_size = int(r.headers.get("content-length", 0)) - with tqdm(total=total_size, unit="B", unit_scale=True) as pbar: - with open(local_path, "wb") as f: - for data in r.iter_content(chunk_size=chunk_size): - if data: - f.write(data) - pbar.update(chunk_size) - - -class KeyNotFoundError(Exception): - def __init__(self, cause, keys=None, visited=None): - self.cause = cause - self.keys = keys - self.visited = visited - messages = list() - if keys is not None: - messages.append("Key not found: {}".format(keys)) - if visited is not None: - 
messages.append("Visited: {}".format(visited)) - messages.append("Cause:\n{}".format(cause)) - message = "\n".join(messages) - super().__init__(message) - - -def retrieve(list_or_dict, key, splitval="/", default=None, expand=True, pass_success=False): - """Given a nested list or dict return the desired value at key expanding - callable nodes if necessary and :attr:`expand` is ``True``. The expansion - is done in-place. - - Parameters - ---------- - list_or_dict : list or dict - Possibly nested list or dictionary. - key : str - key/to/value, path like string describing all keys necessary to - consider to get to the desired value. List indices can also be - passed here. - splitval : str - String that defines the delimiter between keys of the - different depth levels in `key`. - default : obj - Value returned if :attr:`key` is not found. - expand : bool - Whether to expand callable nodes on the path or not. - - Returns - ------- - The desired value or if :attr:`default` is not ``None`` and the - :attr:`key` is not found returns ``default``. - - Raises - ------ - Exception if ``key`` not in ``list_or_dict`` and :attr:`default` is - ``None``. - """ - - keys = key.split(splitval) - - success = True - try: - visited = [] - parent = None - last_key = None - for key in keys: - if callable(list_or_dict): - if not expand: - raise KeyNotFoundError( - ValueError("Trying to get past callable node with expand=False."), - keys=keys, - visited=visited, - ) - list_or_dict = list_or_dict() - parent[last_key] = list_or_dict - - last_key = key - parent = list_or_dict - - try: - if isinstance(list_or_dict, dict): - list_or_dict = list_or_dict[key] - else: - list_or_dict = list_or_dict[int(key)] - except (KeyError, IndexError, ValueError) as e: - raise KeyNotFoundError(e, keys=keys, visited=visited) - - visited += [key] - # final expansion of retrieved value - if expand and callable(list_or_dict): - list_or_dict = list_or_dict() - parent[last_key] = list_or_dict - except KeyNotFoundError as e: - if default is None: - raise e - else: - list_or_dict = default - success = False - - if not pass_success: - return list_or_dict - else: - return list_or_dict, success - - -def give_synsets_from_indices(indices, path_to_yaml="data/imagenet_idx_to_synset.yaml"): - synsets = [] - with open(path_to_yaml) as f: - di2s = yaml.load(f) - for idx in indices: - synsets.append(str(di2s[idx])) - print("Using {} different synsets for construction of Restriced Imagenet.".format(len(synsets))) - return synsets - - -def str_to_indices(string): - """Expects a string in the format '32-123, 256, 280-321'""" - assert not string.endswith(","), "provided string '{}' ends with a comma, pls remove it".format(string) - subs = string.split(",") - indices = [] - for sub in subs: - subsubs = sub.split("-") - assert len(subsubs) > 0 - if len(subsubs) == 1: - indices.append(int(subsubs[0])) - else: - rang = [j for j in range(int(subsubs[0]), int(subsubs[1]))] - indices.extend(rang) - return sorted(indices) - - -def is_prepared(root): - return Path(root).joinpath(".ready").exists() - - -def mark_prepared(root): - Path(root).joinpath(".ready").touch() - - -def synset2idx(path_to_yaml="data/index_synset.yaml"): - with open(path_to_yaml) as f: - di2s = yaml.load(f) - return dict((v, k) for k, v in di2s.items()) - - -class ImageNetBase(Dataset): - def __init__(self, config=None): - self.config = config - if not type(self.config) == dict: - self.config = {} - self.keep_orig_class_label = self.config.get("keep_orig_class_label", False) - self.process_images 
= True # if False we skip loading & processing images and self.data contains filepaths - self._prepare() - self._prepare_synset_to_human() - self._prepare_idx_to_synset() - self._prepare_human_to_integer_label() - self._load() - - def __len__(self): - return len(self.data) - - def __getitem__(self, i): - return self.data[i] - - def _prepare(self): - raise NotImplementedError() - - def _filter_relpaths(self, relpaths): - ignore = set( - [ - "n06596364_9591.JPEG", - ] - ) - relpaths = [rpath for rpath in relpaths if not rpath.split("/")[-1] in ignore] - if "sub_indices" in self.config: - indices = str_to_indices(self.config["sub_indices"]) - synsets = give_synsets_from_indices(indices, path_to_yaml=self.idx2syn) # returns a list of strings - self.synset2idx = synset2idx(path_to_yaml=self.idx2syn) - files = [] - for rpath in relpaths: - syn = rpath.split("/")[0] - if syn in synsets: - files.append(rpath) - return files - else: - return relpaths - - def _prepare_synset_to_human(self): - SIZE = 2655750 - URL = "https://heibox.uni-heidelberg.de/f/9f28e956cd304264bb82/?dl=1" - self.human_dict = os.path.join(self.root, "synset_human.txt") - if not os.path.exists(self.human_dict) or not os.path.getsize(self.human_dict) == SIZE: - download(URL, self.human_dict) - - def _prepare_idx_to_synset(self): - URL = "https://heibox.uni-heidelberg.de/f/d835d5b6ceda4d3aa910/?dl=1" - self.idx2syn = os.path.join(self.root, "index_synset.yaml") - if not os.path.exists(self.idx2syn): - download(URL, self.idx2syn) - - def _prepare_human_to_integer_label(self): - URL = "https://heibox.uni-heidelberg.de/f/2362b797d5be43b883f6/?dl=1" - self.human2integer = os.path.join(self.root, "imagenet1000_clsidx_to_labels.txt") - if not os.path.exists(self.human2integer): - download(URL, self.human2integer) - with open(self.human2integer, "r") as f: - lines = f.read().splitlines() - assert len(lines) == 1000 - self.human2integer_dict = dict() - for line in lines: - value, key = line.split(":") - self.human2integer_dict[key] = int(value) - - def _load(self): - with open(self.txt_filelist, "r") as f: - self.relpaths = f.read().splitlines() - l1 = len(self.relpaths) - self.relpaths = self._filter_relpaths(self.relpaths) - print("Removed {} files from filelist during filtering.".format(l1 - len(self.relpaths))) - - self.synsets = [p.split("/")[0] for p in self.relpaths] - self.abspaths = [os.path.join(self.datadir, p) for p in self.relpaths] - - unique_synsets = np.unique(self.synsets) - class_dict = dict((synset, i) for i, synset in enumerate(unique_synsets)) - if not self.keep_orig_class_label: - self.class_labels = [class_dict[s] for s in self.synsets] - else: - self.class_labels = [self.synset2idx[s] for s in self.synsets] - - with open(self.human_dict, "r") as f: - human_dict = f.read().splitlines() - human_dict = dict(line.split(maxsplit=1) for line in human_dict) - - self.human_labels = [human_dict[s] for s in self.synsets] - - labels = { - "relpath": np.array(self.relpaths), - "synsets": np.array(self.synsets), - "class_label": np.array(self.class_labels), - "human_label": np.array(self.human_labels), - } - - if self.process_images: - self.size = retrieve(self.config, "size", default=256) - self.data = ImagePaths( - self.abspaths, - labels=labels, - size=self.size, - random_crop=self.random_crop, - ) - else: - self.data = self.abspaths - - -class ImageNetTrain(ImageNetBase): - NAME = "ILSVRC2012_train" - URL = "http://www.image-net.org/challenges/LSVRC/2012/" - AT_HASH = "a306397ccf9c2ead27155983c254227c0fd938e2" - FILES = [ - 
"ILSVRC2012_img_train.tar", - ] - SIZES = [ - 147897477120, - ] - - def __init__(self, process_images=True, data_root=None, **kwargs): - self.process_images = process_images - self.data_root = data_root - super().__init__(**kwargs) - - def _prepare(self): - if self.data_root: - self.root = os.path.join(self.data_root, self.NAME) - else: - cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")) - self.root = os.path.join(cachedir, "autoencoders/data", self.NAME) - - self.datadir = os.path.join(self.root, "data") - self.txt_filelist = os.path.join(self.root, "filelist.txt") - self.expected_length = 1281167 - self.random_crop = retrieve(self.config, "ImageNetTrain/random_crop", default=True) - if not is_prepared(self.root): - # prep - print("Preparing dataset {} in {}".format(self.NAME, self.root)) - - datadir = self.datadir - if not os.path.exists(datadir): - path = os.path.join(self.root, self.FILES[0]) - if not os.path.exists(path) or not os.path.getsize(path) == self.SIZES[0]: - import academictorrents as at - - atpath = at.get(self.AT_HASH, datastore=self.root) - assert atpath == path - - print("Extracting {} to {}".format(path, datadir)) - os.makedirs(datadir, exist_ok=True) - with tarfile.open(path, "r:") as tar: - tar.extractall(path=datadir) - - print("Extracting sub-tars.") - subpaths = sorted(glob.glob(os.path.join(datadir, "*.tar"))) - for subpath in tqdm(subpaths): - subdir = subpath[: -len(".tar")] - os.makedirs(subdir, exist_ok=True) - with tarfile.open(subpath, "r:") as tar: - tar.extractall(path=subdir) - - filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG")) - filelist = [os.path.relpath(p, start=datadir) for p in filelist] - filelist = sorted(filelist) - filelist = "\n".join(filelist) + "\n" - with open(self.txt_filelist, "w") as f: - f.write(filelist) - - mark_prepared(self.root) - - -class ImageNetValidation(ImageNetBase): - NAME = "ILSVRC2012_validation" - URL = "http://www.image-net.org/challenges/LSVRC/2012/" - AT_HASH = "5d6d0df7ed81efd49ca99ea4737e0ae5e3a5f2e5" - VS_URL = "https://heibox.uni-heidelberg.de/f/3e0f6e9c624e45f2bd73/?dl=1" - FILES = [ - "ILSVRC2012_img_val.tar", - "validation_synset.txt", - ] - SIZES = [ - 6744924160, - 1950000, - ] - - def __init__(self, process_images=True, data_root=None, **kwargs): - self.data_root = data_root - self.process_images = process_images - super().__init__(**kwargs) - - def _prepare(self): - if self.data_root: - self.root = os.path.join(self.data_root, self.NAME) - else: - cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")) - self.root = os.path.join(cachedir, "autoencoders/data", self.NAME) - self.datadir = os.path.join(self.root, "data") - self.txt_filelist = os.path.join(self.root, "filelist.txt") - self.expected_length = 50000 - self.random_crop = retrieve(self.config, "ImageNetValidation/random_crop", default=False) - if not is_prepared(self.root): - # prep - print("Preparing dataset {} in {}".format(self.NAME, self.root)) - - datadir = self.datadir - if not os.path.exists(datadir): - path = os.path.join(self.root, self.FILES[0]) - if not os.path.exists(path) or not os.path.getsize(path) == self.SIZES[0]: - import academictorrents as at - - atpath = at.get(self.AT_HASH, datastore=self.root) - assert atpath == path - - print("Extracting {} to {}".format(path, datadir)) - os.makedirs(datadir, exist_ok=True) - with tarfile.open(path, "r:") as tar: - tar.extractall(path=datadir) - - vspath = os.path.join(self.root, self.FILES[1]) - if not os.path.exists(vspath) or not 
os.path.getsize(vspath) == self.SIZES[1]: - download(self.VS_URL, vspath) - - with open(vspath, "r") as f: - synset_dict = f.read().splitlines() - synset_dict = dict(line.split() for line in synset_dict) - - print("Reorganizing into synset folders") - synsets = np.unique(list(synset_dict.values())) - for s in synsets: - os.makedirs(os.path.join(datadir, s), exist_ok=True) - for k, v in synset_dict.items(): - src = os.path.join(datadir, k) - dst = os.path.join(datadir, v) - shutil.move(src, dst) - - filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG")) - filelist = [os.path.relpath(p, start=datadir) for p in filelist] - filelist = sorted(filelist) - filelist = "\n".join(filelist) + "\n" - with open(self.txt_filelist, "w") as f: - f.write(filelist) - - mark_prepared(self.root) - - -class ImageNetSR(Dataset): - def __init__( - self, - size=None, - degradation=None, - downscale_f=4, - min_crop_f=0.5, - max_crop_f=1.0, - random_crop=True, - output_LR_image=False, - ): - """ - Imagenet Superresolution Dataloader - Performs following ops in order: - 1. crops a crop of size s from image either as random or center crop - 2. resizes crop to size with cv2.area_interpolation - 3. degrades resized crop with degradation_fn - :param size: resizing to size after cropping - :param degradation: degradation_fn, e.g. cv_bicubic or bsrgan_light - :param downscale_f: Low Resolution Downsample factor - :param min_crop_f: determines crop size s, - where s = c * min_img_side_len with c sampled from interval (min_crop_f, max_crop_f) - :param max_crop_f: "" - :param data_root: - :param random_crop: - :param output_LR_image whether or not output LR Image - """ - # (TODO, junnyu) whether or not output LR_image - self.output_LR_image = output_LR_image - self.base = self.get_base() - assert size - assert (size / downscale_f).is_integer() - self.size = size - self.LR_size = int(size / downscale_f) - self.min_crop_f = min_crop_f - self.max_crop_f = max_crop_f - assert max_crop_f <= 1.0 - self.center_crop = not random_crop - - self.image_rescaler = albumentations.SmallestMaxSize(max_size=size, interpolation=cv2.INTER_AREA) - - self.pil_interpolation = False # gets reset later if incase interp_op is from pillow - - if degradation == "bsrgan": - self.degradation_process = partial(degradation_fn_bsr, sf=downscale_f) - - elif degradation == "bsrgan_light": - self.degradation_process = partial(degradation_fn_bsr_light, sf=downscale_f) - - else: - self.pil_interpolation = degradation.startswith("pil_") - - if self.pil_interpolation: - interpolation_fn = degradation.replace("pil_", "") - self.degradation_process = partial(TF.resize, size=self.LR_size, interpolation=interpolation_fn) - else: - interpolation_fn = { - "cv_nearest": cv2.INTER_NEAREST, - "cv_bilinear": cv2.INTER_LINEAR, - "cv_bicubic": cv2.INTER_CUBIC, - "cv_area": cv2.INTER_AREA, - "cv_lanczos": cv2.INTER_LANCZOS4, - }[degradation] - self.degradation_process = albumentations.SmallestMaxSize( - max_size=self.LR_size, interpolation=interpolation_fn - ) - - def __len__(self): - return len(self.base) - - def __getitem__(self, i): - example = self.base[i] - image = Image.open(example["file_path_"]) - - if not image.mode == "RGB": - image = image.convert("RGB") - - image = np.array(image).astype(np.uint8) - - min_side_len = min(image.shape[:2]) - crop_side_len = min_side_len * np.random.uniform(self.min_crop_f, self.max_crop_f, size=None) - crop_side_len = int(crop_side_len) - - if self.center_crop: - self.cropper = albumentations.CenterCrop(height=crop_side_len, 
width=crop_side_len) - - else: - self.cropper = albumentations.RandomCrop(height=crop_side_len, width=crop_side_len) - - image = self.cropper(image=image)["image"] - image = self.image_rescaler(image=image)["image"] - - if self.output_LR_image: - if self.pil_interpolation: - image_pil = Image.fromarray(image) - LR_image = self.degradation_process(image_pil) - LR_image = np.array(LR_image).astype(np.uint8) - else: - LR_image = self.degradation_process(image=image)["image"] - example["LR_image"] = (LR_image / 127.5 - 1.0).astype(np.float32).transpose([2, 0, 1]) - - example["image"] = (image / 127.5 - 1.0).astype(np.float32).transpose([2, 0, 1]) - - return example - - -class ImageNetSRTrain(ImageNetSR): - def __init__(self, **kwargs): - super().__init__(**kwargs) - - def get_base(self): - with open("data/imagenet_train_hr_indices.p", "rb") as f: - indices = pickle.load(f) - dset = ImageNetTrain( - process_images=False, - ) - return Subset(dset, indices) - - -class ImageNetSRValidation(ImageNetSR): - def __init__(self, **kwargs): - super().__init__(**kwargs) - - def get_base(self): - with open("data/imagenet_val_hr_indices.p", "rb") as f: - indices = pickle.load(f) - dset = ImageNetValidation(process_images=False) - return Subset(dset, indices) - - -if __name__ == "__main__": - ds = ImageNetSRTrain(size=256, degradation="pil_nearest") - dl = DataLoader(ds, batch_size=4, shuffle=False) - print(len(ds)) - for e in dl: - print(e["image"]) diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/__init__.py b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/__init__.py deleted file mode 100644 index 890a4eea8924..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .bsrgan import degradation_bsrgan_variant as degradation_fn_bsr -from .bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan.py b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan.py deleted file mode 100644 index 13138b0ec905..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan.py +++ /dev/null @@ -1,643 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -# -------------------------------------------- -# Super-Resolution -# -------------------------------------------- -# -# Kai Zhang (cskaizhang@gmail.com) -# https://github.com/cszn -# From 2019/03--2021/08 -# -------------------------------------------- -""" - -import random -from functools import partial - -import albumentations -import cv2 -import numpy as np -import paddle -import paddle.nn.functional as F -import scipy -import scipy.stats as ss -from scipy import ndimage -from scipy.interpolate import interp2d -from scipy.linalg import orth - -from . import utils_image as util - - -def modcrop_np(img, sf): - """ - Args: - img: numpy image, WxH or WxHxC - sf: scale factor - Return: - cropped image - """ - w, h = img.shape[:2] - im = np.copy(img) - return im[: w - w % sf, : h - h % sf, ...] - - -""" -# -------------------------------------------- -# anisotropic Gaussian kernels -# -------------------------------------------- -""" - - -def analytic_kernel(k): - """Calculate the X4 kernel from the X2 kernel (for proof see appendix in paper)""" - k_size = k.shape[0] - # Calculate the big kernels size - big_k = np.zeros((3 * k_size - 2, 3 * k_size - 2)) - # Loop over the small kernel to fill the big one - for r in range(k_size): - for c in range(k_size): - big_k[2 * r : 2 * r + k_size, 2 * c : 2 * c + k_size] += k[r, c] * k - # Crop the edges of the big kernel to ignore very small values and increase run time of SR - crop = k_size // 2 - cropped_big_k = big_k[crop:-crop, crop:-crop] - # Normalize to 1 - return cropped_big_k / cropped_big_k.sum() - - -def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6): - """generate an anisotropic Gaussian kernel - Args: - ksize : e.g., 15, kernel size - theta : [0, pi], rotation angle range - l1 : [0.1,50], scaling of eigenvalues - l2 : [0.1,l1], scaling of eigenvalues - If l1 = l2, will get an isotropic Gaussian kernel. 
- Returns: - k : kernel - """ - - v = np.dot(np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), np.array([1.0, 0.0])) - V = np.array([[v[0], v[1]], [v[1], -v[0]]]) - D = np.array([[l1, 0], [0, l2]]) - Sigma = np.dot(np.dot(V, D), np.linalg.inv(V)) - k = gm_blur_kernel(mean=[0, 0], cov=Sigma, size=ksize) - - return k - - -def gm_blur_kernel(mean, cov, size=15): - center = size / 2.0 + 0.5 - k = np.zeros([size, size]) - for y in range(size): - for x in range(size): - cy = y - center + 1 - cx = x - center + 1 - k[y, x] = ss.multivariate_normal.pdf([cx, cy], mean=mean, cov=cov) - - k = k / np.sum(k) - return k - - -def shift_pixel(x, sf, upper_left=True): - """shift pixel for super-resolution with different scale factors - Args: - x: WxHxC or WxH - sf: scale factor - upper_left: shift direction - """ - h, w = x.shape[:2] - shift = (sf - 1) * 0.5 - xv, yv = np.arange(0, w, 1.0), np.arange(0, h, 1.0) - if upper_left: - x1 = xv + shift - y1 = yv + shift - else: - x1 = xv - shift - y1 = yv - shift - - x1 = np.clip(x1, 0, w - 1) - y1 = np.clip(y1, 0, h - 1) - - if x.ndim == 2: - x = interp2d(xv, yv, x)(x1, y1) - if x.ndim == 3: - for i in range(x.shape[-1]): - x[:, :, i] = interp2d(xv, yv, x[:, :, i])(x1, y1) - - return x - - -def blur(x, k): - """ - x: image, NxcxHxW - k: kernel, Nx1xhxw - """ - n, c = x.shape[:2] - p1, p2 = (k.shape[-2] - 1) // 2, (k.shape[-1] - 1) // 2 - x = F.pad(x, pad=(p1, p2, p1, p2), mode="replicate") - k = k.tile([1, c, 1, 1]) - k = k.reshape([-1, 1, k.shape[2], k.shape[3]]) - x = x.reshape([1, -1, x.shape[2], x.shape[3]]) - x = F.conv2d(x, k, bias=None, stride=1, padding=0, groups=n * c) - x = x.reshape([n, c, x.shape[2], x.shape[3]]) - - return x - - -def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10.0, noise_level=0): - """ " - # modified version of https://github.com/assafshocher/BlindSR_dataset_generator - # Kai Zhang - # min_var = 0.175 * sf # variance of the gaussian kernel will be sampled between min_var and max_var - # max_var = 2.5 * sf - """ - # Set random eigen-vals (lambdas) and angle (theta) for COV matrix - lambda_1 = min_var + np.random.rand() * (max_var - min_var) - lambda_2 = min_var + np.random.rand() * (max_var - min_var) - theta = np.random.rand() * np.pi # random theta - noise = -noise_level + np.random.rand(*k_size) * noise_level * 2 - - # Set COV matrix using Lambdas and Theta - LAMBDA = np.diag([lambda_1, lambda_2]) - Q = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]) - SIGMA = Q @ LAMBDA @ Q.T - INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :] - - # Set expectation position (shifting kernel for aligned image) - MU = k_size // 2 - 0.5 * (scale_factor - 1) # - 0.5 * (scale_factor - k_size % 2) - MU = MU[None, None, :, None] - - # Create meshgrid for Gaussian - [X, Y] = np.meshgrid(range(k_size[0]), range(k_size[1])) - Z = np.stack([X, Y], 2)[:, :, :, None] - - # Calcualte Gaussian for every pixel of the kernel - ZZ = Z - MU - ZZ_t = ZZ.transpose(0, 1, 3, 2) - raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise) - - # shift the kernel so it will be centered - # raw_kernel_centered = kernel_shift(raw_kernel, scale_factor) - - # Normalize the kernel and return - # kernel = raw_kernel_centered / np.sum(raw_kernel_centered) - kernel = raw_kernel / np.sum(raw_kernel) - return kernel - - -def fspecial_gaussian(hsize, sigma): - hsize = [hsize, hsize] - siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0] - std = sigma - [x, y] = 
np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1)) - arg = -(x * x + y * y) / (2 * std * std) - h = np.exp(arg) - h[h < scipy.finfo(float).eps * h.max()] = 0 - sumh = h.sum() - if sumh != 0: - h = h / sumh - return h - - -def fspecial_laplacian(alpha): - alpha = max([0, min([alpha, 1])]) - h1 = alpha / (alpha + 1) - h2 = (1 - alpha) / (alpha + 1) - h = [[h1, h2, h1], [h2, -4 / (alpha + 1), h2], [h1, h2, h1]] - h = np.array(h) - return h - - -def fspecial(filter_type, *args, **kwargs): - """ - python code from: - https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py - """ - if filter_type == "gaussian": - return fspecial_gaussian(*args, **kwargs) - if filter_type == "laplacian": - return fspecial_laplacian(*args, **kwargs) - - -""" -# -------------------------------------------- -# degradation models -# -------------------------------------------- -""" - - -def bicubic_degradation(x, sf=3): - """ - Args: - x: HxWxC image, [0, 1] - sf: down-scale factor - Return: - bicubicly downsampled LR image - """ - x = util.imresize_np(x, scale=1 / sf) - return x - - -def srmd_degradation(x, k, sf=3): - """blur + bicubic downsampling - Args: - x: HxWxC image, [0, 1] - k: hxw, double - sf: down-scale factor - Return: - downsampled LR image - Reference: - @inproceedings{zhang2018learning, - title={Learning a single convolutional super-resolution network for multiple degradations}, - author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei}, - booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, - pages={3262--3271}, - year={2018} - } - """ - x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode="wrap") # 'nearest' | 'mirror' - x = bicubic_degradation(x, sf=sf) - return x - - -def dpsr_degradation(x, k, sf=3): - """bicubic downsampling + blur - Args: - x: HxWxC image, [0, 1] - k: hxw, double - sf: down-scale factor - Return: - downsampled LR image - Reference: - @inproceedings{zhang2019deep, - title={Deep Plug-and-Play Super-Resolution for Arbitrary Blur Kernels}, - author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei}, - booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, - pages={1671--1681}, - year={2019} - } - """ - x = bicubic_degradation(x, sf=sf) - x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode="wrap") - return x - - -def classical_degradation(x, k, sf=3): - """blur + downsampling - Args: - x: HxWxC image, [0, 1]/[0, 255] - k: hxw, double - sf: down-scale factor - Return: - downsampled LR image - """ - x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode="wrap") - # x = filters.correlate(x, np.expand_dims(np.flip(k), axis=2)) - st = 0 - return x[st::sf, st::sf, ...] - - -def add_sharpening(img, weight=0.5, radius=50, threshold=10): - """USM sharpening. borrowed from real-ESRGAN - Input image: I; Blurry image: B. - 1. K = I + weight * (I - B) - 2. Mask = 1 if abs(I - B) > threshold, else: 0 - 3. Blur mask: - 4. Out = Mask * K + (1 - Mask) * I - Args: - img (Numpy array): Input image, HWC, BGR; float32, [0, 1]. - weight (float): Sharp weight. Default: 1. - radius (float): Kernel size of Gaussian blur. Default: 50. 
- threshold (int): - """ - if radius % 2 == 0: - radius += 1 - blur = cv2.GaussianBlur(img, (radius, radius), 0) - residual = img - blur - mask = np.abs(residual) * 255 > threshold - mask = mask.astype("float32") - soft_mask = cv2.GaussianBlur(mask, (radius, radius), 0) - - K = img + weight * residual - K = np.clip(K, 0, 1) - return soft_mask * K + (1 - soft_mask) * img - - -def add_blur(img, sf=4): - wd2 = 4.0 + sf - wd = 2.0 + 0.2 * sf - if random.random() < 0.5: - l1 = wd2 * random.random() - l2 = wd2 * random.random() - k = anisotropic_Gaussian(ksize=2 * random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2) - else: - k = fspecial("gaussian", 2 * random.randint(2, 11) + 3, wd * random.random()) - img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode="mirror") - - return img - - -def add_resize(img, sf=4): - rnum = np.random.rand() - if rnum > 0.8: # up - sf1 = random.uniform(1, 2) - elif rnum < 0.7: # down - sf1 = random.uniform(0.5 / sf, 1) - else: - sf1 = 1.0 - img = cv2.resize(img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), interpolation=random.choice([1, 2, 3])) - img = np.clip(img, 0.0, 1.0) - - return img - - -def add_Gaussian_noise(img, noise_level1=2, noise_level2=25): - noise_level = random.randint(noise_level1, noise_level2) - rnum = np.random.rand() - if rnum > 0.6: # add color Gaussian noise - img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) - elif rnum < 0.4: # add grayscale Gaussian noise - img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) - else: # add noise - L = noise_level2 / 255.0 - D = np.diag(np.random.rand(3)) - U = orth(np.random.rand(3, 3)) - conv = np.dot(np.dot(np.transpose(U), D), U) - img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) - img = np.clip(img, 0.0, 1.0) - return img - - -def add_speckle_noise(img, noise_level1=2, noise_level2=25): - noise_level = random.randint(noise_level1, noise_level2) - img = np.clip(img, 0.0, 1.0) - rnum = random.random() - if rnum > 0.6: - img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) - elif rnum < 0.4: - img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) - else: - L = noise_level2 / 255.0 - D = np.diag(np.random.rand(3)) - U = orth(np.random.rand(3, 3)) - conv = np.dot(np.dot(np.transpose(U), D), U) - img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) - img = np.clip(img, 0.0, 1.0) - return img - - -def add_Poisson_noise(img): - img = np.clip((img * 255.0).round(), 0, 255) / 255.0 - vals = 10 ** (2 * random.random() + 2.0) # [2, 4] - if random.random() < 0.5: - img = np.random.poisson(img * vals).astype(np.float32) / vals - else: - img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114]) - img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.0 - noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray - img += noise_gray[:, :, np.newaxis] - img = np.clip(img, 0.0, 1.0) - return img - - -def add_JPEG_noise(img): - quality_factor = random.randint(30, 95) - img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR) - result, encimg = cv2.imencode(".jpg", img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor]) - img = cv2.imdecode(encimg, 1) - img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB) - return img - - -def random_crop(lq, hq, sf=4, lq_patchsize=64): - h, w = 
lq.shape[:2] - rnd_h = random.randint(0, h - lq_patchsize) - rnd_w = random.randint(0, w - lq_patchsize) - lq = lq[rnd_h : rnd_h + lq_patchsize, rnd_w : rnd_w + lq_patchsize, :] - - rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf) - hq = hq[rnd_h_H : rnd_h_H + lq_patchsize * sf, rnd_w_H : rnd_w_H + lq_patchsize * sf, :] - return lq, hq - - -def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): - """ - This is the degradation model of BSRGAN from the paper - "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution" - ---------- - img: HXWXC, [0, 1], its size should be large than (lq_patchsizexsf)x(lq_patchsizexsf) - sf: scale factor - isp_model: camera ISP model - Returns - ------- - img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1] - hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1] - """ - isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25 - sf_ori = sf - - h1, w1 = img.shape[:2] - img = img.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] # mod crop - h, w = img.shape[:2] - - if h < lq_patchsize * sf or w < lq_patchsize * sf: - raise ValueError(f"img size ({h1}X{w1}) is too small!") - - hq = img.copy() - - if sf == 4 and random.random() < scale2_prob: # downsample1 - if np.random.rand() < 0.5: - img = cv2.resize( - img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])), interpolation=random.choice([1, 2, 3]) - ) - else: - img = util.imresize_np(img, 1 / 2, True) - img = np.clip(img, 0.0, 1.0) - sf = 2 - - shuffle_order = random.sample(range(7), 7) - idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3) - if idx1 > idx2: # keep downsample3 last - shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1] - - for i in shuffle_order: - - if i == 0: - img = add_blur(img, sf=sf) - - elif i == 1: - img = add_blur(img, sf=sf) - - elif i == 2: - a, b = img.shape[1], img.shape[0] - # downsample2 - if random.random() < 0.75: - sf1 = random.uniform(1, 2 * sf) - img = cv2.resize( - img, - (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])), - interpolation=random.choice([1, 2, 3]), - ) - else: - k = fspecial("gaussian", 25, random.uniform(0.1, 0.6 * sf)) - k_shifted = shift_pixel(k, sf) - k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel - img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode="mirror") - img = img[0::sf, 0::sf, ...] # nearest downsampling - img = np.clip(img, 0.0, 1.0) - - elif i == 3: - # downsample3 - img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3])) - img = np.clip(img, 0.0, 1.0) - - elif i == 4: - # add Gaussian noise - img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25) - - elif i == 5: - # add JPEG noise - if random.random() < jpeg_prob: - img = add_JPEG_noise(img) - - elif i == 6: - # add processed camera sensor noise - if random.random() < isp_prob and isp_model is not None: - with paddle.no_grad(): - img, hq = isp_model.forward(img.copy(), hq) - - # add final JPEG compression noise - img = add_JPEG_noise(img) - - # random crop - img, hq = random_crop(img, hq, sf_ori, lq_patchsize) - - return img, hq - - -# todo no isp_model? 
-def degradation_bsrgan_variant(image, sf=4, isp_model=None): - """ - This is the degradation model of BSRGAN from the paper - "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution" - ---------- - sf: scale factor - isp_model: camera ISP model - Returns - ------- - img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1] - hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1] - """ - image = util.uint2single(image) - _, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25 - - h1, w1 = image.shape[:2] - image = image.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] # mod crop - h, w = image.shape[:2] - - if sf == 4 and random.random() < scale2_prob: # downsample1 - if np.random.rand() < 0.5: - image = cv2.resize( - image, - (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])), - interpolation=random.choice([1, 2, 3]), - ) - else: - image = util.imresize_np(image, 1 / 2, True) - image = np.clip(image, 0.0, 1.0) - sf = 2 - - shuffle_order = random.sample(range(7), 7) - idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3) - if idx1 > idx2: # keep downsample3 last - shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1] - - for i in shuffle_order: - - if i == 0: - image = add_blur(image, sf=sf) - - elif i == 1: - image = add_blur(image, sf=sf) - - elif i == 2: - a, b = image.shape[1], image.shape[0] - # downsample2 - if random.random() < 0.75: - sf1 = random.uniform(1, 2 * sf) - image = cv2.resize( - image, - (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])), - interpolation=random.choice([1, 2, 3]), - ) - else: - k = fspecial("gaussian", 25, random.uniform(0.1, 0.6 * sf)) - k_shifted = shift_pixel(k, sf) - k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel - image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode="mirror") - image = image[0::sf, 0::sf, ...] 
# nearest downsampling - image = np.clip(image, 0.0, 1.0) - - elif i == 3: - # downsample3 - image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3])) - image = np.clip(image, 0.0, 1.0) - - elif i == 4: - # add Gaussian noise - image = add_Gaussian_noise(image, noise_level1=2, noise_level2=25) - - elif i == 5: - # add JPEG noise - if random.random() < jpeg_prob: - image = add_JPEG_noise(image) - - # add final JPEG compression noise - image = add_JPEG_noise(image) - image = util.single2uint(image) - example = {"image": image} - return example - - -if __name__ == "__main__": - print("hey") - img = util.imread_uint("utils/test.png", 3) - img = img[:448, :448] - h = img.shape[0] // 4 - print("resizing to", h) - sf = 4 - deg_fn = partial(degradation_bsrgan_variant, sf=sf) - for i in range(20): - print(i) - img_hq = img - img_lq = deg_fn(img)["image"] - img_hq, img_lq = util.uint2single(img_hq), util.uint2single(img_lq) - print(img_lq) - img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)[ - "image" - ] - print(img_lq.shape) - print("bicubic", img_lq_bicubic.shape) - print(img_hq.shape) - lq_nearest = cv2.resize( - util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), interpolation=0 - ) - lq_bicubic_nearest = cv2.resize( - util.single2uint(img_lq_bicubic), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), interpolation=0 - ) - img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1) - util.imsave(img_concat, str(i) + ".png") diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan_light.py b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan_light.py deleted file mode 100644 index 4c04cad55e21..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan_light.py +++ /dev/null @@ -1,648 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random -from functools import partial - -import albumentations -import cv2 -import numpy as np -import paddle -import paddle.nn.functional as F -import scipy -import scipy.stats as ss -from scipy import ndimage -from scipy.interpolate import interp2d -from scipy.linalg import orth - -from . import utils_image as util - -""" -# -------------------------------------------- -# Super-Resolution -# -------------------------------------------- -# -# Kai Zhang (cskaizhang@gmail.com) -# https://github.com/cszn -# From 2019/03--2021/08 -# -------------------------------------------- -""" - - -def modcrop_np(img, sf): - """ - Args: - img: numpy image, WxH or WxHxC - sf: scale factor - Return: - cropped image - """ - w, h = img.shape[:2] - im = np.copy(img) - return im[: w - w % sf, : h - h % sf, ...] 
- - -""" -# -------------------------------------------- -# anisotropic Gaussian kernels -# -------------------------------------------- -""" - - -def analytic_kernel(k): - """Calculate the X4 kernel from the X2 kernel (for proof see appendix in paper)""" - k_size = k.shape[0] - # Calculate the big kernels size - big_k = np.zeros((3 * k_size - 2, 3 * k_size - 2)) - # Loop over the small kernel to fill the big one - for r in range(k_size): - for c in range(k_size): - big_k[2 * r : 2 * r + k_size, 2 * c : 2 * c + k_size] += k[r, c] * k - # Crop the edges of the big kernel to ignore very small values and increase run time of SR - crop = k_size // 2 - cropped_big_k = big_k[crop:-crop, crop:-crop] - # Normalize to 1 - return cropped_big_k / cropped_big_k.sum() - - -def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6): - """generate an anisotropic Gaussian kernel - Args: - ksize : e.g., 15, kernel size - theta : [0, pi], rotation angle range - l1 : [0.1,50], scaling of eigenvalues - l2 : [0.1,l1], scaling of eigenvalues - If l1 = l2, will get an isotropic Gaussian kernel. - Returns: - k : kernel - """ - - v = np.dot(np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), np.array([1.0, 0.0])) - V = np.array([[v[0], v[1]], [v[1], -v[0]]]) - D = np.array([[l1, 0], [0, l2]]) - Sigma = np.dot(np.dot(V, D), np.linalg.inv(V)) - k = gm_blur_kernel(mean=[0, 0], cov=Sigma, size=ksize) - - return k - - -def gm_blur_kernel(mean, cov, size=15): - center = size / 2.0 + 0.5 - k = np.zeros([size, size]) - for y in range(size): - for x in range(size): - cy = y - center + 1 - cx = x - center + 1 - k[y, x] = ss.multivariate_normal.pdf([cx, cy], mean=mean, cov=cov) - - k = k / np.sum(k) - return k - - -def shift_pixel(x, sf, upper_left=True): - """shift pixel for super-resolution with different scale factors - Args: - x: WxHxC or WxH - sf: scale factor - upper_left: shift direction - """ - h, w = x.shape[:2] - shift = (sf - 1) * 0.5 - xv, yv = np.arange(0, w, 1.0), np.arange(0, h, 1.0) - if upper_left: - x1 = xv + shift - y1 = yv + shift - else: - x1 = xv - shift - y1 = yv - shift - - x1 = np.clip(x1, 0, w - 1) - y1 = np.clip(y1, 0, h - 1) - - if x.ndim == 2: - x = interp2d(xv, yv, x)(x1, y1) - if x.ndim == 3: - for i in range(x.shape[-1]): - x[:, :, i] = interp2d(xv, yv, x[:, :, i])(x1, y1) - - return x - - -def blur(x, k): - """ - x: image, NxcxHxW - k: kernel, Nx1xhxw - """ - n, c = x.shape[:2] - p1, p2 = (k.shape[-2] - 1) // 2, (k.shape[-1] - 1) // 2 - x = F.pad(x, pad=(p1, p2, p1, p2), mode="replicate") - k = k.tile([1, c, 1, 1]) - k = k.reshape([-1, 1, k.shape[2], k.shape[3]]) - x = x.reshape([1, -1, x.shape[2], x.shape[3]]) - x = F.conv2d(x, k, bias=None, stride=1, padding=0, groups=n * c) - x = x.reshape([n, c, x.shape[2], x.shape[3]]) - - return x - - -def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10.0, noise_level=0): - """ " - # modified version of https://github.com/assafshocher/BlindSR_dataset_generator - # Kai Zhang - # min_var = 0.175 * sf # variance of the gaussian kernel will be sampled between min_var and max_var - # max_var = 2.5 * sf - """ - # Set random eigen-vals (lambdas) and angle (theta) for COV matrix - lambda_1 = min_var + np.random.rand() * (max_var - min_var) - lambda_2 = min_var + np.random.rand() * (max_var - min_var) - theta = np.random.rand() * np.pi # random theta - noise = -noise_level + np.random.rand(*k_size) * noise_level * 2 - - # Set COV matrix using Lambdas and Theta - LAMBDA = np.diag([lambda_1, 
lambda_2]) - Q = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]) - SIGMA = Q @ LAMBDA @ Q.T - INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :] - - # Set expectation position (shifting kernel for aligned image) - MU = k_size // 2 - 0.5 * (scale_factor - 1) # - 0.5 * (scale_factor - k_size % 2) - MU = MU[None, None, :, None] - - # Create meshgrid for Gaussian - [X, Y] = np.meshgrid(range(k_size[0]), range(k_size[1])) - Z = np.stack([X, Y], 2)[:, :, :, None] - - # Calcualte Gaussian for every pixel of the kernel - ZZ = Z - MU - ZZ_t = ZZ.transpose(0, 1, 3, 2) - raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise) - - # shift the kernel so it will be centered - # raw_kernel_centered = kernel_shift(raw_kernel, scale_factor) - - # Normalize the kernel and return - # kernel = raw_kernel_centered / np.sum(raw_kernel_centered) - kernel = raw_kernel / np.sum(raw_kernel) - return kernel - - -def fspecial_gaussian(hsize, sigma): - hsize = [hsize, hsize] - siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0] - std = sigma - [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1)) - arg = -(x * x + y * y) / (2 * std * std) - h = np.exp(arg) - h[h < scipy.finfo(float).eps * h.max()] = 0 - sumh = h.sum() - if sumh != 0: - h = h / sumh - return h - - -def fspecial_laplacian(alpha): - alpha = max([0, min([alpha, 1])]) - h1 = alpha / (alpha + 1) - h2 = (1 - alpha) / (alpha + 1) - h = [[h1, h2, h1], [h2, -4 / (alpha + 1), h2], [h1, h2, h1]] - h = np.array(h) - return h - - -def fspecial(filter_type, *args, **kwargs): - """ - python code from: - https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py - """ - if filter_type == "gaussian": - return fspecial_gaussian(*args, **kwargs) - if filter_type == "laplacian": - return fspecial_laplacian(*args, **kwargs) - - -""" -# -------------------------------------------- -# degradation models -# -------------------------------------------- -""" - - -def bicubic_degradation(x, sf=3): - """ - Args: - x: HxWxC image, [0, 1] - sf: down-scale factor - Return: - bicubicly downsampled LR image - """ - x = util.imresize_np(x, scale=1 / sf) - return x - - -def srmd_degradation(x, k, sf=3): - """blur + bicubic downsampling - Args: - x: HxWxC image, [0, 1] - k: hxw, double - sf: down-scale factor - Return: - downsampled LR image - Reference: - @inproceedings{zhang2018learning, - title={Learning a single convolutional super-resolution network for multiple degradations}, - author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei}, - booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, - pages={3262--3271}, - year={2018} - } - """ - x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode="wrap") # 'nearest' | 'mirror' - x = bicubic_degradation(x, sf=sf) - return x - - -def dpsr_degradation(x, k, sf=3): - """bicubic downsampling + blur - Args: - x: HxWxC image, [0, 1] - k: hxw, double - sf: down-scale factor - Return: - downsampled LR image - Reference: - @inproceedings{zhang2019deep, - title={Deep Plug-and-Play Super-Resolution for Arbitrary Blur Kernels}, - author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei}, - booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, - pages={1671--1681}, - year={2019} - } - """ - x = bicubic_degradation(x, sf=sf) - x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode="wrap") - return x - - -def classical_degradation(x, k, sf=3): - 
"""blur + downsampling - Args: - x: HxWxC image, [0, 1]/[0, 255] - k: hxw, double - sf: down-scale factor - Return: - downsampled LR image - """ - x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode="wrap") - # x = filters.correlate(x, np.expand_dims(np.flip(k), axis=2)) - st = 0 - return x[st::sf, st::sf, ...] - - -def add_sharpening(img, weight=0.5, radius=50, threshold=10): - """USM sharpening. borrowed from real-ESRGAN - Input image: I; Blurry image: B. - 1. K = I + weight * (I - B) - 2. Mask = 1 if abs(I - B) > threshold, else: 0 - 3. Blur mask: - 4. Out = Mask * K + (1 - Mask) * I - Args: - img (Numpy array): Input image, HWC, BGR; float32, [0, 1]. - weight (float): Sharp weight. Default: 1. - radius (float): Kernel size of Gaussian blur. Default: 50. - threshold (int): - """ - if radius % 2 == 0: - radius += 1 - blur = cv2.GaussianBlur(img, (radius, radius), 0) - residual = img - blur - mask = np.abs(residual) * 255 > threshold - mask = mask.astype("float32") - soft_mask = cv2.GaussianBlur(mask, (radius, radius), 0) - - K = img + weight * residual - K = np.clip(K, 0, 1) - return soft_mask * K + (1 - soft_mask) * img - - -def add_blur(img, sf=4): - wd2 = 4.0 + sf - wd = 2.0 + 0.2 * sf - - wd2 = wd2 / 4 - wd = wd / 4 - - if random.random() < 0.5: - l1 = wd2 * random.random() - l2 = wd2 * random.random() - k = anisotropic_Gaussian(ksize=random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2) - else: - k = fspecial("gaussian", random.randint(2, 4) + 3, wd * random.random()) - img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode="mirror") - - return img - - -def add_resize(img, sf=4): - rnum = np.random.rand() - if rnum > 0.8: # up - sf1 = random.uniform(1, 2) - elif rnum < 0.7: # down - sf1 = random.uniform(0.5 / sf, 1) - else: - sf1 = 1.0 - img = cv2.resize(img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), interpolation=random.choice([1, 2, 3])) - img = np.clip(img, 0.0, 1.0) - - return img - - -def add_Gaussian_noise(img, noise_level1=2, noise_level2=25): - noise_level = random.randint(noise_level1, noise_level2) - rnum = np.random.rand() - if rnum > 0.6: # add color Gaussian noise - img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) - elif rnum < 0.4: # add grayscale Gaussian noise - img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) - else: # add noise - L = noise_level2 / 255.0 - D = np.diag(np.random.rand(3)) - U = orth(np.random.rand(3, 3)) - conv = np.dot(np.dot(np.transpose(U), D), U) - img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) - img = np.clip(img, 0.0, 1.0) - return img - - -def add_speckle_noise(img, noise_level1=2, noise_level2=25): - noise_level = random.randint(noise_level1, noise_level2) - img = np.clip(img, 0.0, 1.0) - rnum = random.random() - if rnum > 0.6: - img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) - elif rnum < 0.4: - img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) - else: - L = noise_level2 / 255.0 - D = np.diag(np.random.rand(3)) - U = orth(np.random.rand(3, 3)) - conv = np.dot(np.dot(np.transpose(U), D), U) - img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) - img = np.clip(img, 0.0, 1.0) - return img - - -def add_Poisson_noise(img): - img = np.clip((img * 255.0).round(), 0, 255) / 255.0 - vals = 10 ** (2 * random.random() + 2.0) # 
[2, 4] - if random.random() < 0.5: - img = np.random.poisson(img * vals).astype(np.float32) / vals - else: - img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114]) - img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.0 - noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray - img += noise_gray[:, :, np.newaxis] - img = np.clip(img, 0.0, 1.0) - return img - - -def add_JPEG_noise(img): - quality_factor = random.randint(80, 95) - img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR) - result, encimg = cv2.imencode(".jpg", img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor]) - img = cv2.imdecode(encimg, 1) - img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB) - return img - - -def random_crop(lq, hq, sf=4, lq_patchsize=64): - h, w = lq.shape[:2] - rnd_h = random.randint(0, h - lq_patchsize) - rnd_w = random.randint(0, w - lq_patchsize) - lq = lq[rnd_h : rnd_h + lq_patchsize, rnd_w : rnd_w + lq_patchsize, :] - - rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf) - hq = hq[rnd_h_H : rnd_h_H + lq_patchsize * sf, rnd_w_H : rnd_w_H + lq_patchsize * sf, :] - return lq, hq - - -def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): - """ - This is the degradation model of BSRGAN from the paper - "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution" - ---------- - img: HXWXC, [0, 1], its size should be large than (lq_patchsizexsf)x(lq_patchsizexsf) - sf: scale factor - isp_model: camera ISP model - Returns - ------- - img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1] - hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1] - """ - isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25 - sf_ori = sf - - h1, w1 = img.shape[:2] - img = img.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] # mod crop - h, w = img.shape[:2] - - if h < lq_patchsize * sf or w < lq_patchsize * sf: - raise ValueError(f"img size ({h1}X{w1}) is too small!") - - hq = img.copy() - - if sf == 4 and random.random() < scale2_prob: # downsample1 - if np.random.rand() < 0.5: - img = cv2.resize( - img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])), interpolation=random.choice([1, 2, 3]) - ) - else: - img = util.imresize_np(img, 1 / 2, True) - img = np.clip(img, 0.0, 1.0) - sf = 2 - - shuffle_order = random.sample(range(7), 7) - idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3) - if idx1 > idx2: # keep downsample3 last - shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1] - - for i in shuffle_order: - - if i == 0: - img = add_blur(img, sf=sf) - - elif i == 1: - img = add_blur(img, sf=sf) - - elif i == 2: - a, b = img.shape[1], img.shape[0] - # downsample2 - if random.random() < 0.75: - sf1 = random.uniform(1, 2 * sf) - img = cv2.resize( - img, - (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])), - interpolation=random.choice([1, 2, 3]), - ) - else: - k = fspecial("gaussian", 25, random.uniform(0.1, 0.6 * sf)) - k_shifted = shift_pixel(k, sf) - k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel - img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode="mirror") - img = img[0::sf, 0::sf, ...] 
# nearest downsampling - img = np.clip(img, 0.0, 1.0) - - elif i == 3: - # downsample3 - img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3])) - img = np.clip(img, 0.0, 1.0) - - elif i == 4: - # add Gaussian noise - img = add_Gaussian_noise(img, noise_level1=2, noise_level2=8) - - elif i == 5: - # add JPEG noise - if random.random() < jpeg_prob: - img = add_JPEG_noise(img) - - elif i == 6: - # add processed camera sensor noise - if random.random() < isp_prob and isp_model is not None: - with paddle.no_grad(): - img, hq = isp_model.forward(img.copy(), hq) - - # add final JPEG compression noise - img = add_JPEG_noise(img) - - # random crop - img, hq = random_crop(img, hq, sf_ori, lq_patchsize) - - return img, hq - - -# todo no isp_model? -def degradation_bsrgan_variant(image, sf=4, isp_model=None): - """ - This is the degradation model of BSRGAN from the paper - "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution" - ---------- - sf: scale factor - isp_model: camera ISP model - Returns - ------- - img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1] - hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1] - """ - image = util.uint2single(image) - _, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25 - - h1, w1 = image.shape[:2] - image = image.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] # mod crop - h, w = image.shape[:2] - - if sf == 4 and random.random() < scale2_prob: # downsample1 - if np.random.rand() < 0.5: - image = cv2.resize( - image, - (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])), - interpolation=random.choice([1, 2, 3]), - ) - else: - image = util.imresize_np(image, 1 / 2, True) - image = np.clip(image, 0.0, 1.0) - sf = 2 - - shuffle_order = random.sample(range(7), 7) - idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3) - if idx1 > idx2: # keep downsample3 last - shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1] - - for i in shuffle_order: - - if i == 0: - image = add_blur(image, sf=sf) - - if i == 0: - pass - - elif i == 2: - a, b = image.shape[1], image.shape[0] - # downsample2 - if random.random() < 0.8: - sf1 = random.uniform(1, 2 * sf) - image = cv2.resize( - image, - (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])), - interpolation=random.choice([1, 2, 3]), - ) - else: - k = fspecial("gaussian", 25, random.uniform(0.1, 0.6 * sf)) - k_shifted = shift_pixel(k, sf) - k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel - image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode="mirror") - image = image[0::sf, 0::sf, ...] 
# nearest downsampling - - image = np.clip(image, 0.0, 1.0) - - elif i == 3: - # downsample3 - image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3])) - image = np.clip(image, 0.0, 1.0) - - elif i == 4: - # add Gaussian noise - image = add_Gaussian_noise(image, noise_level1=1, noise_level2=2) - - elif i == 5: - # add JPEG noise - if random.random() < jpeg_prob: - image = add_JPEG_noise(image) - - # add final JPEG compression noise - image = add_JPEG_noise(image) - image = util.single2uint(image) - example = {"image": image} - return example - - -if __name__ == "__main__": - print("hey") - img = util.imread_uint("utils/test.png", 3) - img = img[:448, :448] - h = img.shape[0] // 4 - print("resizing to", h) - sf = 4 - deg_fn = partial(degradation_bsrgan_variant, sf=sf) - for i in range(20): - print(i) - img_hq = img - img_lq = deg_fn(img)["image"] - img_hq, img_lq = util.uint2single(img_hq), util.uint2single(img_lq) - print(img_lq) - img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)[ - "image" - ] - print(img_lq.shape) - print("bicubic", img_lq_bicubic.shape) - print(img_hq.shape) - lq_nearest = cv2.resize( - util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), interpolation=0 - ) - lq_bicubic_nearest = cv2.resize( - util.single2uint(img_lq_bicubic), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), interpolation=0 - ) - img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1) - util.imsave(img_concat, str(i) + ".png") diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/utils_image.py b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/utils_image.py deleted file mode 100644 index be3bdaa3321c..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/utils_image.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import math - -import cv2 -import numpy as np -import paddle - - -def narrow(x, axis, start, length): - return paddle.slice(x, [axis], [start], [start + length]) - - -def uint2single(img): - return np.float32(img / 255.0) - - -def single2uint(img): - return np.uint8((img.clip(0, 1) * 255.0).round()) - - -# -------------------------------------------- -# get uint8 image of size HxWxn_channles (RGB) -# -------------------------------------------- -def imread_uint(path, n_channels=3): - # input: path - # output: HxWx3(RGB or GGG), or HxWx1 (G) - if n_channels == 1: - img = cv2.imread(path, 0) # cv2.IMREAD_GRAYSCALE - img = np.expand_dims(img, axis=2) # HxWx1 - elif n_channels == 3: - img = cv2.imread(path, cv2.IMREAD_UNCHANGED) # BGR or G - if img.ndim == 2: - img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) # GGG - else: - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # RGB - return img - - -# -------------------------------------------- -# matlab's imwrite -# -------------------------------------------- -def imsave(img, img_path): - img = np.squeeze(img) - if img.ndim == 3: - img = img[:, :, [2, 1, 0]] - cv2.imwrite(img_path, img) - - -def imwrite(img, img_path): - img = np.squeeze(img) - if img.ndim == 3: - img = img[:, :, [2, 1, 0]] - cv2.imwrite(img_path, img) - - -# matlab 'imresize' function, now only support 'bicubic' -def cubic(x): - absx = paddle.abs(x) - absx2 = absx**2 - absx3 = absx**3 - return (1.5 * absx3 - 2.5 * absx2 + 1) * ((absx <= 1).astype(absx.dtype)) + ( - -0.5 * absx3 + 2.5 * absx2 - 4 * absx + 2 - ) * (((absx > 1) * (absx <= 2)).astype(absx.dtype)) - - -def calculate_weights_indices(in_length, out_length, scale, kernel, kernel_width, antialiasing): - if (scale < 1) and (antialiasing): - # Use a modified kernel to simultaneously interpolate and antialias- larger kernel width - kernel_width = kernel_width / scale - - # Output-space coordinates - x = paddle.linspace(1, out_length, out_length) - - # Input-space coordinates. Calculate the inverse mapping such that 0.5 - # in output space maps to 0.5 in input space, and 0.5+scale in output - # space maps to 1.5 in input space. - u = x / scale + 0.5 * (1 - 1 / scale) - - # What is the left-most pixel that can be involved in the computation? - left = paddle.floor(u - kernel_width / 2) - - # What is the maximum number of pixels that can be involved in the - # computation? Note: it's OK to use an extra pixel here; if the - # corresponding weights are all zero, it will be eliminated at the end - # of this function. - P = math.ceil(kernel_width) + 2 - - # The indices of the input pixels involved in computing the k-th output - # pixel are in row k of the indices matrix. - indices = left.reshape([out_length, 1]).expand([out_length, P]) + paddle.linspace(0, P - 1, P).reshape( - [1, P] - ).expand([out_length, P]) - - # The weights used to compute the k-th output pixel are in row k of the - # weights matrix. - distance_to_center = u.reshape([out_length, 1]).expand([out_length, P]) - indices - # apply cubic kernel - if (scale < 1) and (antialiasing): - weights = scale * cubic(distance_to_center * scale) - else: - weights = cubic(distance_to_center) - # Normalize the weights matrix so that each row sums to 1. - weights_sum = paddle.sum(weights, 1).reshape([out_length, 1]) - weights = weights / weights_sum.expand([out_length, P]) - - # If a column in weights is all zero, get rid of it. only consider the first and last column. 
- weights_zero_tmp = paddle.sum((weights == 0).astype("int64"), 0) - if not math.isclose(weights_zero_tmp[0], 0, rel_tol=1e-6): - indices = narrow(indices, 1, 1, P - 2) - weights = narrow(weights, 1, 1, P - 2) - if not math.isclose(weights_zero_tmp[-1], 0, rel_tol=1e-6): - indices = narrow(indices, 1, 0, P - 2) - weights = narrow(weights, 1, 0, P - 2) - - sym_len_s = -indices.min() + 1 - sym_len_e = indices.max() - in_length - indices = indices + sym_len_s - 1 - return weights, indices, int(sym_len_s), int(sym_len_e) - - -# -------------------------------------------- -# imresize for numpy image [0, 1] -# -------------------------------------------- -def imresize_np(img, scale, antialiasing=True): - # Now the scale should be the same for H and W - # input: img: Numpy, HWC or HW [0,1] - # output: HWC or HW [0,1] w/o round - img = paddle.to_tensor(img) - need_squeeze = True if img.ndim == 2 else False - if need_squeeze: - img = img.unsqueeze(2) - - in_H, in_W, in_C = img.shape - out_C, out_H, out_W = in_C, math.ceil(in_H * scale), math.ceil(in_W * scale) - kernel_width = 4 - kernel = "cubic" - - # Return the desired dimension order for performing the resize. The - # strategy is to perform the resize first along the dimension with the - # smallest scale factor. - # Now we do not support this. - - # get weights and indices - weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices( - in_H, out_H, scale, kernel, kernel_width, antialiasing - ) - weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices( - in_W, out_W, scale, kernel, kernel_width, antialiasing - ) - # process H dimension - # symmetric copying - img_aug = paddle.zeros([in_H + sym_len_Hs + sym_len_He, in_W, in_C]) - img_aug[sym_len_Hs : sym_len_Hs + in_H] = img - - sym_patch = img[:sym_len_Hs, :, :] - inv_idx = paddle.arange(sym_patch.shape[0] - 1, -1, -1).astype("int64") - sym_patch_inv = sym_patch.index_select(inv_idx, axis=0) - img_aug[:sym_len_Hs] = sym_patch_inv - - sym_patch = img[-sym_len_He:, :, :] - inv_idx = paddle.arange(sym_patch.shape[0] - 1, -1, -1).astype("int64") - sym_patch_inv = sym_patch.index_select(inv_idx, axis=0) - img_aug[sym_len_Hs + in_H : sym_len_Hs + in_H + sym_len_He] = sym_patch_inv - - out_1 = paddle.zeros([out_H, in_W, in_C]) - kernel_width = weights_H.shape[1] - for i in range(out_H): - idx = int(indices_H[i][0]) - for j in range(out_C): - out_1[i, :, j] = img_aug[idx : idx + kernel_width, :, j].transpose([1, 0]).mv(weights_H[i]) - - # process W dimension - # symmetric copying - out_1_aug = paddle.zeros([out_H, in_W + sym_len_Ws + sym_len_We, in_C]) - out_1_aug[:, sym_len_Ws : sym_len_Ws + in_W] = out_1 - - sym_patch = out_1[:, :sym_len_Ws, :] - inv_idx = paddle.arange(sym_patch.shape[1] - 1, -1, -1).astype("int64") - sym_patch_inv = sym_patch.index_select(inv_idx, axis=1) - out_1_aug[:, :sym_len_Ws] = sym_patch_inv - - sym_patch = out_1[:, -sym_len_We:, :] - inv_idx = paddle.arange(sym_patch.shape[1] - 1, -1, -1).astype("int64") - sym_patch_inv = sym_patch.index_select(inv_idx, axis=1) - out_1_aug[:, sym_len_Ws + in_W : sym_len_Ws + in_W + sym_len_We] = sym_patch_inv - - out_2 = paddle.zeros([out_H, out_W, in_C]) - kernel_width = weights_W.shape[1] - for i in range(out_W): - idx = int(indices_W[i][0]) - for j in range(out_C): - out_2[:, i, j] = out_1_aug[:, idx : idx + kernel_width, j].mv(weights_W[i]) - if need_squeeze: - out_2 = out_2.squeeze() - - return out_2.numpy() diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/losses.py 
b/ppdiffusers/examples/autoencoder/vae/ldm/losses.py deleted file mode 100644 index a1d4f642125a..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/ldm/losses.py +++ /dev/null @@ -1,542 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import functools -from collections import namedtuple - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.utils.download import get_weights_path_from_url - -from ppdiffusers.initializer import constant_, normal_, reset_initialized_parameter - -model_urls = { - "vgg16": ( - "https://paddlenlp.bj.bcebos.com/models/lpips_vgg16/lpips_vgg16.pdparams", - "a1583475db9e49334735f2866847ae41", - ), - "vgg_netlin": ( - "https://paddlenlp.bj.bcebos.com/models/lpips_vgg16/vgg_netlin.pdparams", - "f3ae85f16a1a243e789606ae0c4a59a1", - ), -} - - -class ActNorm(nn.Layer): - def __init__(self, num_features, logdet=False, affine=True, allow_reverse_init=False): - assert affine - super().__init__() - self.logdet = logdet - self.loc = self.create_parameter((1, num_features, 1, 1), default_initializer=nn.initializer.Constant(0)) - self.scale = self.create_parameter((1, num_features, 1, 1), default_initializer=nn.initializer.Constant(1)) - self.allow_reverse_init = allow_reverse_init - - self.register_buffer("initialized", paddle.to_tensor(0, dtype=paddle.int64)) - - @paddle.no_grad() - def initialize(self, input): - flatten = input.transpose([1, 0, 2, 3]).reshape([input.shape[1], -1]) - mean = flatten.mean(1).unsqueeze(1).unsqueeze(2).unsqueeze(3).transpose([1, 0, 2, 3]) - std = flatten.std(1).unsqueeze(1).unsqueeze(2).unsqueeze(3).transpose([1, 0, 2, 3]) - - self.loc.set_value(-mean) - self.scale.set_value(1 / (std + 1e-6)) - - def forward(self, input, reverse=False): - if reverse: - return self.reverse(input) - if len(input.shape) == 2: - input = input[:, :, None, None] - squeeze = True - else: - squeeze = False - - _, _, height, width = input.shape - - if self.training and self.initialized.item() == 0: - self.initialize(input) - self.initialized.set_value(paddle.to_tensor(1, dtype=self.initialized.dtype)) - - h = self.scale * (input + self.loc) - - if squeeze: - h = h.squeeze(-1).squeeze(-1) - - if self.logdet: - log_abs = paddle.log(paddle.abs(self.scale)) - logdet = height * width * paddle.sum(log_abs) - logdet = logdet * input.shape[0] - return h, logdet - - return h - - def reverse(self, output): - if self.training and self.initialized.item() == 0: - if not self.allow_reverse_init: - raise RuntimeError( - "Initializing ActNorm in reverse direction is " - "disabled by default. Use allow_reverse_init=True to enable." 
- ) - else: - self.initialize(output) - self.initialized.set_value(paddle.to_tensor(1, dtype=self.initialized.dtype)) - - if len(output.shape) == 2: - output = output[:, :, None, None] - squeeze = True - else: - squeeze = False - - h = output / self.scale - self.loc - - if squeeze: - h = h.squeeze(-1).squeeze(-1) - return h - - -def adopt_weight(weight, global_step, threshold=0, value=0.0): - if global_step < threshold: - weight = value - return weight - - -def hinge_d_loss(logits_real, logits_fake): - loss_real = paddle.mean(F.relu(1.0 - logits_real)) - loss_fake = paddle.mean(F.relu(1.0 + logits_fake)) - d_loss = 0.5 * (loss_real + loss_fake) - return d_loss - - -def vanilla_d_loss(logits_real, logits_fake): - d_loss = 0.5 * (paddle.mean(F.softplus(-logits_real)) + paddle.mean(F.softplus(logits_fake))) - return d_loss - - -def weights_init(m): - classname = m.__class__.__name__ - if classname.find("Conv") != -1: - normal_(m.weight, 0.0, 0.02) - elif classname.find("BatchNorm") != -1: - normal_(m.weight, 1.0, 0.02) - constant_(m.bias, 0.0) - - -class NLayerDiscriminator(nn.Layer): - r"""Defines a PatchGAN discriminator as in Pix2Pix - --> see https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py - - Construct a PatchGAN discriminator - - Parameters: - input_nc (int) -- the number of channels in input images - ndf (int) -- the number of filters in the last conv layer - n_layers (int) -- the number of conv layers in the discriminator - norm_layer -- normalization layer - """ - - def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False): - super().__init__() - if not use_actnorm: - norm_layer = nn.BatchNorm2D - else: - norm_layer = ActNorm - if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters - use_bias = norm_layer.func != nn.BatchNorm2D - else: - use_bias = norm_layer != nn.BatchNorm2D - - kw = 4 - padw = 1 - sequence = [ - nn.Conv2D(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), - nn.LeakyReLU(0.2), - ] - nf_mult = 1 - nf_mult_prev = 1 - for n in range(1, n_layers): # gradually increase the number of filters - nf_mult_prev = nf_mult - nf_mult = min(2**n, 8) - sequence += [ - nn.Conv2D( - ndf * nf_mult_prev, - ndf * nf_mult, - kernel_size=kw, - stride=2, - padding=padw, - bias_attr=use_bias, - ), - norm_layer(ndf * nf_mult), - nn.LeakyReLU(0.2), - ] - - nf_mult_prev = nf_mult - nf_mult = min(2**n_layers, 8) - sequence += [ - nn.Conv2D( - ndf * nf_mult_prev, - ndf * nf_mult, - kernel_size=kw, - stride=1, - padding=padw, - bias_attr=use_bias, - ), - norm_layer(ndf * nf_mult), - nn.LeakyReLU(0.2), - ] - - sequence += [ - nn.Conv2D(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw) - ] # output 1 channel prediction map - self.main = nn.Sequential(*sequence) - - def forward(self, input): - """Standard forward.""" - return self.main(input) - - -def spatial_average(in_tens, keepdim=True): - return in_tens.mean([2, 3], keepdim=keepdim) - - -def upsample(in_tens, out_HW=(64, 64)): # assumes scale factor is same for H and W - return nn.Upsample(size=out_HW, mode="bilinear", align_corners=False)(in_tens) - - -def normalize_tensor(in_feat, eps=1e-10): - norm_factor = paddle.sqrt(paddle.sum(in_feat**2, axis=1, keepdim=True)) - return in_feat / (norm_factor + eps) - - -class NetLinLayer(nn.Layer): - """A single linear layer which does a 1x1 conv""" - - def __init__(self, chn_in, chn_out=1, use_dropout=False): - super(NetLinLayer, self).__init__() - - layers = ( - [ - nn.Dropout(), - ] - if 
(use_dropout) - else [] - ) - layers += [ - nn.Conv2D(chn_in, chn_out, 1, stride=1, padding=0, bias_attr=False), - ] - self.model = nn.Sequential(*layers) - - def forward(self, x): - return self.model(x) - - -class ScalingLayer(nn.Layer): - def __init__(self): - super(ScalingLayer, self).__init__() - self.register_buffer( - "shift", - paddle.to_tensor(np.asarray([-0.030, -0.088, -0.188]).astype("float32")[None, :, None, None]), - ) - self.register_buffer( - "scale", - paddle.to_tensor(np.asarray([0.458, 0.448, 0.450]).astype("float32")[None, :, None, None]), - ) - - def forward(self, inp): - return (inp - self.shift) / self.scale - - -class VGG16(nn.Layer): - def __init__(self, pretrained=True, requires_grad=False): - super(VGG16, self).__init__() - vgg_model = paddle.vision.models.vgg16(pretrained=False) - if pretrained: - state_dict = paddle.load(get_weights_path_from_url(*model_urls["vgg16"])) - vgg_model.set_state_dict(state_dict) - vgg_pretrained_features = vgg_model.features - self.slice1 = nn.Sequential() - self.slice2 = nn.Sequential() - self.slice3 = nn.Sequential() - self.slice4 = nn.Sequential() - self.slice5 = nn.Sequential() - self.N_slices = 5 - for x in range(4): - self.slice1.add_sublayer(str(x), vgg_pretrained_features[x]) - for x in range(4, 9): - self.slice2.add_sublayer(str(x), vgg_pretrained_features[x]) - for x in range(9, 16): - self.slice3.add_sublayer(str(x), vgg_pretrained_features[x]) - for x in range(16, 23): - self.slice4.add_sublayer(str(x), vgg_pretrained_features[x]) - for x in range(23, 30): - self.slice5.add_sublayer(str(x), vgg_pretrained_features[x]) - if not requires_grad: - for param in self.parameters(): - param.stop_gradient = True - - def forward(self, X): - h = self.slice1(X) - h_relu1_2 = h - h = self.slice2(h) - h_relu2_2 = h - h = self.slice3(h) - h_relu3_3 = h - h = self.slice4(h) - h_relu4_3 = h - h = self.slice5(h) - h_relu5_3 = h - vgg_outputs = namedtuple("VggOutputs", ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"]) - out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3) - - return out - - -class LPIPS(nn.Layer): - def __init__( - self, - pretrained=True, - net="alex", - lpips=True, - spatial=False, - pnet_rand=False, - pnet_tune=False, - use_dropout=True, - model_path=None, - eval_mode=True, - verbose=True, - ): - # lpips - [True] means with linear calibration on top of base network - # pretrained - [True] means load linear weights - - super(LPIPS, self).__init__() - if verbose: - print( - "Setting up [%s] perceptual loss: trunk [%s], spatial [%s]" - % ("LPIPS" if lpips else "baseline", net, "on" if spatial else "off") - ) - - self.pnet_type = net.lower() - self.pnet_tune = pnet_tune - self.pnet_rand = pnet_rand - self.spatial = spatial - self.lpips = lpips # false means baseline of just averaging all layers - self.scaling_layer = ScalingLayer() - - if self.pnet_type in ["vgg", "vgg16"]: - net_type = VGG16 - self.chns = [64, 128, 256, 512, 512] - else: - raise NotImplementedError - self.L = len(self.chns) - - self.net = net_type(pretrained=not self.pnet_rand, requires_grad=self.pnet_tune) - - if lpips: - lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout) - lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout) - lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout) - lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout) - lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout) - lins = [lin0, lin1, lin2, lin3, lin4] - if self.pnet_type == "squeeze": # 7 layers for squeezenet - lin5 = 
NetLinLayer(self.chns[5], use_dropout=use_dropout) - lin6 = NetLinLayer(self.chns[6], use_dropout=use_dropout) - lins += [lin5, lin6] - self.lins = nn.LayerList(lins) - - if pretrained: - if model_path is None: - model_path = get_weights_path_from_url(*model_urls["vgg_netlin"]) - if verbose: - print("Loading model from: %s" % model_path) - import warnings - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - self.set_state_dict(paddle.load(model_path)) - - if eval_mode: - self.eval() - for param in self.parameters(): - param.stop_gradient = True - - def forward(self, in0, in1, retPerLayer=False, normalize=False): - if normalize: # turn on this flag if input is [0,1] so it can be adjusted to [-1, +1] - in0 = 2 * in0 - 1 - in1 = 2 * in1 - 1 - - # v0.0 - original release had a bug, where input was not scaled - in0_input, in1_input = (self.scaling_layer(in0), self.scaling_layer(in1)) - outs0, outs1 = self.net.forward(in0_input), self.net.forward(in1_input) - feats0, feats1, diffs = {}, {}, {} - - for kk in range(self.L): - feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk]) - diffs[kk] = (feats0[kk] - feats1[kk]) ** 2 - - if self.lpips: - if self.spatial: - res = [upsample(self.lins[kk](diffs[kk]), out_HW=in0.shape[2:]) for kk in range(self.L)] - else: - res = [spatial_average(self.lins[kk](diffs[kk]), keepdim=True) for kk in range(self.L)] - else: - if self.spatial: - res = [upsample(diffs[kk].sum(axis=1, keepdim=True), out_HW=in0.shape[2:]) for kk in range(self.L)] - else: - res = [spatial_average(diffs[kk].sum(axis=1, keepdim=True), keepdim=True) for kk in range(self.L)] - - val = res[0] - for l in range(1, self.L): - val += res[l] - - if retPerLayer: - return (val, res) - else: - return val - - -class LPIPSWithDiscriminator(nn.Layer): - def __init__( - self, - disc_start, - logvar_init=0.0, - kl_weight=1.0, - pixelloss_weight=1.0, - disc_num_layers=3, - disc_in_channels=3, - disc_factor=1.0, - disc_weight=1.0, - perceptual_weight=1.0, - use_actnorm=False, - disc_conditional=False, - disc_loss="hinge", - ): - - super().__init__() - assert disc_loss in ["hinge", "vanilla"] - self.kl_weight = kl_weight - self.pixel_weight = pixelloss_weight - # LPIPS - self.perceptual_loss = LPIPS(net="vgg") - self.perceptual_loss.eval() - - self.perceptual_weight = perceptual_weight - self.discriminator = NLayerDiscriminator( - input_nc=disc_in_channels, n_layers=disc_num_layers, use_actnorm=use_actnorm - ) - reset_initialized_parameter(self.discriminator) - self.discriminator.apply(weights_init) - - # output log variance - self.logvar = self.create_parameter((1,), default_initializer=nn.initializer.Constant(logvar_init)) - - self.discriminator_iter_start = disc_start - self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss - self.disc_factor = disc_factor - self.discriminator_weight = disc_weight - self.disc_conditional = disc_conditional - - def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None): - if last_layer is not None: - nll_grads = paddle.autograd.grad(nll_loss, last_layer, retain_graph=True)[0] - g_grads = paddle.autograd.grad(g_loss, last_layer, retain_graph=True)[0] - else: - nll_grads = paddle.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0] - g_grads = paddle.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0] - - d_weight = paddle.norm(nll_grads) / (paddle.norm(g_grads) + 1e-4) - d_weight = paddle.clip(d_weight, 0.0, 1e4).detach() - d_weight = d_weight * self.discriminator_weight - 
return d_weight - - def forward( - self, - inputs, - reconstructions, - posteriors, - optimizer_idx, - global_step, - last_layer=None, - cond=None, - split="train", - weights=None, - ): - rec_loss = paddle.abs(inputs - reconstructions) - if self.perceptual_weight > 0: - p_loss = self.perceptual_loss(inputs, reconstructions) - - rec_loss = rec_loss + self.perceptual_weight * p_loss - - nll_loss = rec_loss / paddle.exp(self.logvar) + self.logvar - weighted_nll_loss = nll_loss - if weights is not None: - weighted_nll_loss = weights * nll_loss - weighted_nll_loss = paddle.sum(weighted_nll_loss) / weighted_nll_loss.shape[0] - nll_loss = paddle.sum(nll_loss) / nll_loss.shape[0] - kl_loss = posteriors.kl() - kl_loss = paddle.sum(kl_loss) / kl_loss.shape[0] - - # now the GAN part - if optimizer_idx == 0: - # generator update - if cond is None: - assert not self.disc_conditional - logits_fake = self.discriminator(reconstructions) - else: - assert self.disc_conditional - logits_fake = self.discriminator(paddle.concat((reconstructions, cond), axis=1)) - g_loss = -paddle.mean(logits_fake) - if self.disc_factor > 0.0: - try: - d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer) - except Exception: - assert not self.training - d_weight = paddle.to_tensor(0.0) - else: - d_weight = paddle.to_tensor(0.0) - - disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) - loss = weighted_nll_loss + self.kl_weight * kl_loss + d_weight * disc_factor * g_loss - - log = { - "{}/total_loss".format(split): loss.clone().detach().mean().item(), - "{}/logvar".format(split): self.logvar.detach().item(), - "{}/kl_loss".format(split): kl_loss.detach().mean().item(), - "{}/nll_loss".format(split): nll_loss.detach().mean().item(), - "{}/rec_loss".format(split): rec_loss.detach().mean().item(), - "{}/d_weight".format(split): d_weight.detach().item(), - "{}/disc_factor".format(split): paddle.to_tensor(disc_factor).item(), - "{}/g_loss".format(split): g_loss.detach().mean().item(), - } - return loss, log - - if optimizer_idx == 1: - # second pass for discriminator update - if cond is None: - logits_real = self.discriminator(inputs.detach()) - logits_fake = self.discriminator(reconstructions.detach()) - else: - logits_real = self.discriminator(paddle.concat((inputs.detach(), cond), axis=1)) - logits_fake = self.discriminator(paddle.concat((reconstructions.detach(), cond), axis=1)) - disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) - d_loss = disc_factor * self.disc_loss(logits_real, logits_fake) - - log = { - "{}/disc_loss".format(split): d_loss.clone().detach().mean().item(), - "{}/logits_real".format(split): logits_real.detach().mean().item(), - "{}/logits_fake".format(split): logits_fake.detach().mean().item(), - } - return d_loss, log diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/model.py b/ppdiffusers/examples/autoencoder/vae/ldm/model.py deleted file mode 100644 index 5df1c98fe4c6..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/ldm/model.py +++ /dev/null @@ -1,365 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from contextlib import contextmanager -from typing import Tuple - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdiffusers.configuration_utils import ConfigMixin, register_to_config -from ppdiffusers.initializer import reset_initialized_parameter -from ppdiffusers.models.autoencoder_kl import ( - AutoencoderKLOutput, - Decoder, - DecoderOutput, - DiagonalGaussianDistribution, - Encoder, -) - -# from ppdiffusers.models.ema import LitEma -from ppdiffusers.models.modeling_utils import ModelMixin - -from .losses import LPIPSWithDiscriminator - - -def count_params(model, verbose=True): - total_params = sum(p.numel() for p in model.parameters()).item() - if verbose: - print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.") - return total_params - - -# regist a new model -class AutoencoderKLWithLoss(ModelMixin, ConfigMixin): - _supports_gradient_checkpointing = True - - @register_to_config - def __init__( - self, - in_channels: int = 3, - out_channels: int = 3, - down_block_types: Tuple[str] = ( - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D", - ), - down_block_out_channels: Tuple[int] = None, - up_block_types: Tuple[str] = ( - "UpDecoderBlock2D", - "UpDecoderBlock2D", - "UpDecoderBlock2D", - "UpDecoderBlock2D", - ), - up_block_out_channels: Tuple[int] = None, - block_out_channels: Tuple[int] = (128, 256, 512, 512), - layers_per_block: int = 2, - act_fn: str = "silu", - latent_channels: int = 4, - norm_num_groups: int = 32, - sample_size: int = 512, - # new add - input_size: Tuple[int] = None, - # loss arguments - disc_start=50001, - kl_weight=1.0e-6, - disc_weight=0.5, - logvar_init=0.0, - pixelloss_weight=1.0, - disc_num_layers=3, - disc_in_channels=3, - disc_factor=1.0, - perceptual_weight=1.0, - use_actnorm=False, - disc_conditional=False, - disc_loss="hinge", - use_ema=False, - ema_decay=None, - ): - super().__init__() - self.input_size = [int(_) for _ in input_size] if input_size is not None else None - self.encoder = Encoder( - in_channels=in_channels, - out_channels=latent_channels, - down_block_types=down_block_types, - block_out_channels=down_block_out_channels - if down_block_out_channels - is not None # if down_block_out_channels not givien, we will use block_out_channels - else block_out_channels, - layers_per_block=layers_per_block, - act_fn=act_fn, - norm_num_groups=norm_num_groups, - double_z=True, - ) - - # pass init params to Decoder - self.decoder = Decoder( - in_channels=latent_channels, - out_channels=out_channels, - up_block_types=up_block_types, - block_out_channels=up_block_out_channels # if up_block_out_channels not givien, we will use block_out_channels - if up_block_out_channels is not None - else block_out_channels, - layers_per_block=layers_per_block, - norm_num_groups=norm_num_groups, - act_fn=act_fn, - ) - - self.quant_conv = nn.Conv2D(2 * latent_channels, 2 * latent_channels, 1) - self.post_quant_conv = nn.Conv2D(latent_channels, latent_channels, 1) - - # register a loss function - self.loss = LPIPSWithDiscriminator( - disc_start=disc_start, - 
kl_weight=kl_weight, - disc_weight=disc_weight, - logvar_init=logvar_init, - pixelloss_weight=pixelloss_weight, - disc_num_layers=disc_num_layers, - disc_in_channels=disc_in_channels, - disc_factor=disc_factor, - perceptual_weight=perceptual_weight, - use_actnorm=use_actnorm, - disc_conditional=disc_conditional, - disc_loss=disc_loss, - ) - count_params(self) - self.init_weights() - self.use_ema = use_ema - # if use_ema: - # self.model_ema = LitEma(self, decay=ema_decay) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (Encoder, Decoder)): - module.gradient_checkpointing = value - - def init_weights(self): - reset_initialized_parameter(self.encoder) - reset_initialized_parameter(self.decoder) - reset_initialized_parameter(self.quant_conv) - reset_initialized_parameter(self.post_quant_conv) - - def custom_forward( - self, - sample: paddle.Tensor, - sample_posterior: bool = True, - ): - posterior = self.encode(sample).latent_dist - if sample_posterior: - z = posterior.sample() - else: - z = posterior.mode() - dec = self.decode(z).sample - return dec, posterior - - def get_last_layer(self): - return self.decoder.conv_out.weight - - def on_train_batch_end(self, *args, **kwargs): - # for EMA computation - if self.use_ema: - self.model_ema(self) - - @contextmanager - def ema_scope(self, context=None): - if self.use_ema: - self.model_ema.store(self.parameters()) - self.model_ema.copy_to(self) - if context is not None: - print(f"{context}: Switched to EMA weights") - try: - yield None - finally: - if self.use_ema: - self.model_ema.restore(self.parameters()) - if context is not None: - print(f"{context}: Restored training weights") - - def forward(self, pixel_values, optimizer_idx=0, global_step=0): - # make sure we are in train mode - self.train() - if self.input_size is None: - encoder_inputs = pixel_values - else: - encoder_inputs = F.interpolate(pixel_values, size=self.input_size, mode="bilinear") - - reconstructions, posterior = self.custom_forward(encoder_inputs) - - if optimizer_idx == 0: - # train encoder+decoder+logvar - aeloss, log_dict_ae = self.loss( - pixel_values, - reconstructions, - posterior, - optimizer_idx, - global_step, - last_layer=self.get_last_layer(), - split="train", - ) - return aeloss, log_dict_ae - - if optimizer_idx == 1: - # train the discriminator - discloss, log_dict_disc = self.loss( - pixel_values, - reconstructions, - posterior, - optimizer_idx, - global_step, - last_layer=self.get_last_layer(), - split="train", - ) - return discloss, log_dict_disc - - @paddle.no_grad() - def log_images(self, pixel_values, only_inputs=False, **kwargs): - self.eval() - log = dict() - if self.input_size is None: - encoder_inputs = pixel_values - else: - encoder_inputs = F.interpolate(pixel_values, size=self.input_size, mode="bilinear") - - if not only_inputs: - xrec, posterior = self.custom_forward(encoder_inputs) - log["samples"] = self.decode_image(self.decode(paddle.randn(posterior.sample().shape)).sample) - log["reconstructions"] = self.decode_image(xrec) - if self.use_ema: - with self.ema_scope(): - xrec_ema, posterior_ema = self.custom_forward(encoder_inputs) - log["samples_ema"] = self.decode_image( - self.decode(paddle.randn(posterior_ema.sample().shape)).sample - ) - log["reconstructions_ema"] = self.decode_image(xrec_ema) - # update - log["encoder_inputs"] = self.decode_image(encoder_inputs) - self.train() - return log - - def decode_image(self, image): - image = (image / 2 + 0.5).clip(0, 1).transpose([0, 2, 3, 1]) - image = (image * 
255.0).cast("float32").numpy().round() - return image - - @paddle.no_grad() - def validation_step(self, pixel_values, global_step=0): - log_dict_ae, log_dict_disc = self._validation_step(pixel_values, global_step) - if self.use_ema: - with self.ema_scope(): - log_dict_ae_ema, log_dict_disc_ema = self._validation_step(pixel_values, global_step, postfix="_ema") - log_dict_ae.update(log_dict_ae_ema) - log_dict_disc.update(log_dict_disc_ema) - - return log_dict_ae, log_dict_disc - - def _validation_step(self, pixel_values, global_step=0, postfix=""): - self.eval() - if self.input_size is None: - encoder_inputs = pixel_values - else: - encoder_inputs = F.interpolate(pixel_values, size=self.input_size, mode="bilinear") - - reconstructions, posterior = self.custom_forward(encoder_inputs) - aeloss, log_dict_ae = self.loss( - pixel_values, - reconstructions, - posterior, - 0, - global_step, - last_layer=self.get_last_layer(), - split="val" + postfix, - ) - - discloss, log_dict_disc = self.loss( - pixel_values, - reconstructions, - posterior, - 1, - global_step, - last_layer=self.get_last_layer(), - split="val" + postfix, - ) - self.train() - return log_dict_ae, log_dict_disc - - def toggle_optimizer(self, optimizers, optimizer_idx): - """ - Makes sure only the gradients of the current optimizer's parameters are calculated - in the training step to prevent dangling gradients in multiple-optimizer setup. - It works with :meth:`untoggle_optimizer` to make sure ``param_stop_gradient_state`` is properly reset. - Override for your own behavior. - - Args: - optimizer: Current optimizer used in the training loop - optimizer_idx: Current optimizer idx in the training loop - - Note: - Only called when using multiple optimizers - """ - # Iterate over all optimizer parameters to preserve their `stop_gradient` information - # in case these are pre-defined during `configure_optimizers` - param_stop_gradient_state = {} - for opt in optimizers: - for param in opt._parameter_list: - # If a param already appear in param_stop_gradient_state, continue - if param in param_stop_gradient_state: - continue - param_stop_gradient_state[param] = param.stop_gradient - param.stop_gradient = True - - # Then iterate over the current optimizer's parameters and set its `stop_gradient` - # properties accordingly - for param in optimizers[optimizer_idx]._parameter_list: - param.stop_gradient = param_stop_gradient_state[param] - self._param_stop_gradient_state = param_stop_gradient_state - - def untoggle_optimizer(self, optimizers, optimizer_idx): - """ - Resets the state of required gradients that were toggled with :meth:`toggle_optimizer`. - Override for your own behavior. 
- - Args: - optimizer_idx: Current optimizer idx in the training loop - - Note: - Only called when using multiple optimizers - """ - for opt_idx, opt in enumerate(optimizers): - if optimizer_idx != opt_idx: - for param in opt._parameter_list: - if param in self._param_stop_gradient_state: - param.stop_gradient = self._param_stop_gradient_state[param] - # save memory - self._param_stop_gradient_state = {} - - def encode(self, x: paddle.Tensor, return_dict: bool = True): - h = self.encoder(x) - moments = self.quant_conv(h) - posterior = DiagonalGaussianDistribution(moments) - - if not return_dict: - return (posterior,) - - return AutoencoderKLOutput(latent_dist=posterior) - - def decode(self, z: paddle.Tensor, return_dict: bool = True): - z = self.post_quant_conv(z) - dec = self.decoder(z) - - if not return_dict: - return (dec,) - - return DecoderOutput(sample=dec) diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/text_image_pair.py b/ppdiffusers/examples/autoencoder/vae/ldm/text_image_pair.py deleted file mode 100644 index 4a91b34df3ac..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/ldm/text_image_pair.py +++ /dev/null @@ -1,235 +0,0 @@ -# !/usr/bin/env python3 -# -*- coding: UTF-8 -*- - -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
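For reference, the `AutoencoderKLWithLoss` / `LPIPSWithDiscriminator` pair above (ldm/model.py, ldm/losses.py) is trained GAN-style with two optimizers: `optimizer_idx=0` updates the autoencoder (reconstruction + KL + generator loss) and `optimizer_idx=1` updates the `NLayerDiscriminator`. Below is a minimal sketch of one such training step; the import path, parameter grouping, and learning rate are illustrative assumptions, not the original train_vae.py.

```python
# Minimal sketch of a two-optimizer training step for AutoencoderKLWithLoss.
# Import path, parameter grouping and learning rate are assumptions for
# illustration; the real loop lives in train_vae.py.
import itertools

import paddle

from ldm.model import AutoencoderKLWithLoss  # assumed import path

model = AutoencoderKLWithLoss()

# Assumed split: autoencoder weights (plus the loss log-variance) vs. discriminator weights.
ae_params = list(
    itertools.chain(
        model.encoder.parameters(),
        model.decoder.parameters(),
        model.quant_conv.parameters(),
        model.post_quant_conv.parameters(),
        [model.loss.logvar],
    )
)
opt_ae = paddle.optimizer.Adam(learning_rate=1e-4, parameters=ae_params)
opt_disc = paddle.optimizer.Adam(learning_rate=1e-4, parameters=model.loss.discriminator.parameters())
optimizers = [opt_ae, opt_disc]


def train_step(pixel_values, global_step):
    logs = {}
    for optimizer_idx, opt in enumerate(optimizers):
        # Freeze every parameter not owned by the current optimizer.
        model.toggle_optimizer(optimizers, optimizer_idx)
        loss, log_dict = model(pixel_values, optimizer_idx=optimizer_idx, global_step=global_step)
        loss.backward()
        opt.step()
        opt.clear_grad()
        model.untoggle_optimizer(optimizers, optimizer_idx)
        logs.update(log_dict)
    return logs
```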
- -import base64 -import gzip -import io -import os -import random -import traceback - -import numpy as np -import paddle -import paddle.distributed as dist -from paddle.io import IterableDataset, get_worker_info -from paddle.vision import transforms -from paddle.vision.transforms.transforms import _get_image_size -from PIL import Image - -Image.MAX_IMAGE_PIXELS = 2300000000 - -EXIT_SIGNAL_FILE = "xxxxxxx" - - -def parse_line(line, filename): - def parse_src(filename): - if "laion400m" in filename: - return "laion400m" - else: - raise NotImplementedError(f"Unkown data source, {filename}") - - try: - vec = line.strip().split("\t") - data_source = parse_src(filename) - if data_source == "laion400m": - # _, caption, _, img_b64 = vec[:4] - caption, _, img_b64 = vec[:3] - else: - _, captions, _, _, _, img_b64 = vec[:6] - caption = random.sample(captions.split("|"), 1)[0].replace("\1", "") - - image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB") - if random.random() < 0.075: - caption = "" - return dict(image=image, caption=caption) - except Exception: - print(f"error when parse file {filename}") - traceback.print_exc() - return None - - -# donot use random.randint -class RandomCrop(transforms.RandomCrop): - def _get_param(self, img, output_size): - w, h = _get_image_size(img) - th, tw = output_size - if w == tw and h == th: - return 0, 0, h, w - - i = paddle.randint(0, h - th + 1).item() - j = paddle.randint(0, w - tw + 1).item() - return i, j, th, tw - - -class TextImagePair(IterableDataset): - def __init__( - self, - file_list, - size, - num_records, - image_processing=None, - buffer_size=1000, - shuffle_every_n_samples=5, - interpolation="lanczos", - ): - self.size = size - if image_processing is None: - self.image_processing = transforms.Compose( - [ - transforms.Resize(int(size / 0.9), interpolation), - RandomCrop(size), - transforms.ToTensor(), - transforms.Normalize(0.5, 0.5), - ] - ) - else: - self.image_processing = image_processing - self.file_list = [] - file_weights = [] - with open(file_list, "r") as f: - file_lists = f.read().strip().split("\n") - for file_l in file_lists: - file_l = file_l.split(" ") - if len(file_l) > 1: - file_weight = float(file_l[1]) - file_weights.append(file_weight) - file_l = file_l[0] - with open(file_l, "r") as f: - self.file_list.append(f.read().strip().split("\n")) - print([len(file_l) for file_l in self.file_list]) - if len(file_weights) == len(self.file_list): - file_weights = np.array(file_weights) - file_weight_sum = np.sum(file_weights) - assert file_weight_sum > 0, "sum of file weights must > 0" - file_weights = file_weights / file_weight_sum - print(f"sample weights of files: {file_weights}") - self.file_weights_cumsum = np.cumsum(file_weights) - self.file_weights_cumsum = np.concatenate([[0.0], self.file_weights_cumsum]) - else: - print("sample each file list with same probabiliy") - self.file_weights_cumsum = None - - self.num_records = num_records - self.file_ids = [np.arange(len(filelist)) for filelist in self.file_list] - print(f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}") - self.buffer_size = buffer_size - self.shuffle_every_n_samples = shuffle_every_n_samples - - def sample_loader(self, file_ids, filenames): - while True: - random.shuffle(file_ids) - for i in file_ids: - filename = filenames[i].strip("\n") - with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f: - retry = 0 - while True: - line = f.readline() - - if line == b"": - break - try: - try: - line 
= line.decode(encoding="utf-8") - except Exception: - line = line.decode(encoding="gb18030") - except Exception: - print(f"error on file {filename}") - continue - data = parse_line(line, filename) - if data is None: - retry += 1 - if retry > 100: - break - continue - else: - w, h = data["image"].size - if w < self.size or h < self.size: - continue - data["image"] = self.image_processing(data["image"]) - yield data - - def random_load_from_multi_dataset(self): - print(f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}") - sample_loader_per_dataset = [ - iter(self.sample_loader(self.file_ids[i], self.file_list[i])) for i in range(len(self.file_ids)) - ] - - while True: - if self.file_weights_cumsum is None: - sample_loader = random.choice(sample_loader_per_dataset) - else: - rand_num = random.random() - for i in range(len(self.file_list)): - if self.file_weights_cumsum[i] <= rand_num < self.file_weights_cumsum[i + 1]: - break - sample_loader = sample_loader_per_dataset[i] - # debug - # print(self.file_list[i][0]) - yield next(sample_loader) - - def shuffle(self, iterator): - if os.path.exists(EXIT_SIGNAL_FILE): - print("Stop Task!") - raise NotImplementedError - buffer_list = [] - for _ in range(self.buffer_size): - buffer_list.append(next(iterator)) - i = 0 - while True: - if i % self.shuffle_every_n_samples == 0: - random.shuffle(buffer_list) - yield buffer_list.pop() - buffer_list.append(next(iterator)) - i += 1 - - def __len__(self): - return self.num_records - - def __iter__(self): - return self.shuffle(iter(self.random_load_from_multi_dataset())) - - -def split_data_per_worker(dataset, worker_id, local_rank, world_size, num_workers): - worker_global_id = local_rank * num_workers + worker_id - dataset.rng = np.random.RandomState(worker_global_id) - for i in range(len(dataset.file_ids)): - file_ids = dataset.file_ids[i] - num_chunks = world_size * num_workers - chunk_size = len(file_ids) / num_chunks - - begin_id = int(worker_global_id * chunk_size) - end_id = int((worker_global_id + 1) * chunk_size) - dataset.file_ids[i] = dataset.file_ids[i][begin_id:end_id] - print( - f"dataset {i}, local_rank: {local_rank}, worker_id: {worker_id}, worker_global_id: {worker_global_id}, file_range: ({begin_id}, {end_id})" - ) - return None - - -def worker_init_fn(_): - worker_info = get_worker_info() - dataset = worker_info.dataset - worker_id = worker_info.id - - local_rank = dist.get_rank() - world_size = dist.get_world_size() - num_workers = worker_info.num_workers - if isinstance(dataset, TextImagePair): - split_data_per_worker(dataset, worker_id, local_rank, world_size, num_workers) - return np.random.seed(np.random.get_state()[1][0] + worker_id) - else: - return np.random.seed(np.random.get_state()[1][0] + worker_id) diff --git a/ppdiffusers/examples/autoencoder/vae/requirements.txt b/ppdiffusers/examples/autoencoder/vae/requirements.txt deleted file mode 100644 index 016aa2d1819c..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ -paddlenlp>=2.6.0rc0 -ppdiffusers>=0.16.1 -scipy -pyyaml -opencv-python -albumentations -fastcore -visualdl -Pillow -opencv-python-headless<=4.3 \ No newline at end of file diff --git a/ppdiffusers/examples/autoencoder/vae/run.sh b/ppdiffusers/examples/autoencoder/vae/run.sh deleted file mode 100644 index 1ccd0d8985eb..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/run.sh +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
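The `TextImagePair` stream above (ldm/text_image_pair.py) is meant to be wrapped in a `paddle.io.DataLoader` with `worker_init_fn`, so that each (rank, worker) pair reads a disjoint shard of the filelists. A minimal consumption sketch, with an assumed import path and illustrative filelist path, image size, and batch size:

```python
# Minimal sketch of consuming TextImagePair; filelist path, size and batch
# size are illustrative, and the import path is assumed.
import paddle

from ldm.text_image_pair import TextImagePair, worker_init_fn  # assumed import path

dataset = TextImagePair(
    file_list="data/filelist/train.filelist.list",  # one "<filelist> [weight]" entry per line
    size=256,
    num_records=1_000_000,  # nominal length; the underlying stream is infinite
)
loader = paddle.io.DataLoader(
    dataset,
    batch_size=4,
    num_workers=8,
    worker_init_fn=worker_init_fn,  # shards file_ids across ranks and workers
)

for batch in loader:
    images = batch["image"]      # float tensors normalized to [-1, 1]
    captions = batch["caption"]  # caption strings (a small fraction dropped to "")
    break
```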
-# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -unset PADDLE_ELASTIC_JOB_ID -unset PADDLE_TRAINER_ENDPOINTS -unset DISTRIBUTED_TRAINER_ENDPOINTS -unset FLAGS_START_PORT -unset PADDLE_ELASTIC_TIMEOUT - -nohup python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" train_vae.py \ - --pretrained_model_name_or_path CompVis/stable-diffusion-v1-4 \ - --ignore_keys decoder. \ - --vae_config_file config/vae.json \ - --freeze_encoder \ - --input_size 256 256 \ - --max_train_steps 100000000000 \ - --learning_rate 1e-4 \ - --batch_size 4 \ - --num_workers 8 \ - --logging_steps 100 \ - --save_steps 4000 \ - --image_logging_steps 2000 \ - --disc_start 50001 \ - --kl_weight 0.000001 \ - --disc_weight 0.5 \ - --use_ema \ - --ema_decay 0.9999 \ - --recompute \ - --enable_xformers_memory_efficient_attention \ - --resolution 512 1> paddle_vae.out 2>&1 & \ No newline at end of file diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/README.md b/ppdiffusers/examples/autoencoder/vae/scripts/README.md deleted file mode 100644 index e5cd12011b64..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/scripts/README.md +++ /dev/null @@ -1,46 +0,0 @@ -# 脚本文件 - -本目录下包含了四个脚本文件: -- **convert_kl_8_to_ppdiffusers.py**: 将原版LDM中的 VAE 权重转换为 Paddle 版的权重,注意:我们转换过程中同时转换了 loss 部分的权重。 -- **get_autoencoder_results.py**: 加载训练好的模型文件,然后生成待评估的图片。 -- **fid_score.py**: 计算 fid score 的代码。 -- **calculate_psnr_ssim.py**: 计算 psnr 和 ssim 指标的代码。 - -## 1. Pytorch权重转换为Paddle权重 -假设我们已经预先使用原版LDM代码初始化了一个`"ldm_vae_init0.ckpt"`权重。然后我们需要使用下面的代码进行权重转换。 - -```shell -python convert_kl_8_to_ppdiffusers.py \ - --checkpoint_path ldm_vae_init0.ckpt \ - --dump_path ldm_vae_init0_paddle \ - --original_config_file ../config/f8encoder_f16decoder.yaml -``` -经过转换后,我们可以得到下面的目录结构。 - -```shell -├── ldm_vae_init0_paddle # 我们指定的输出文件路径 - ├── model_state.pdparams - ├── config.json -``` - -## 2. 评估训练好的模型性能 - -### 2.1 生成待评估的图片 - -```shell -python get_autoencoder_results.py --vae_path "./autoencoder_outputs/checkpoint-200000" --src_size 256 --tgt_size 512 --imgs './coco_val2014_resize512_centercrop/*.png' --outdir generate_images/ -``` - -### 2.2 计算FID指标 - -```shell -python get_autoencoder_results.py --src_size 256 --tgt_size 512 --imgs './coco_val2014_resize512_centercrop/*.png' --outdir generate_images/ -``` - -### 2.3 计算PSNR和SSIM指标 - -```shell -python fid_score.py ./coco_val2014_resize512_centercrop/ ./generate_images/ --device gpu - -python calculate_psnr_ssim.py --imgs1 'coco_val2014_resize512_centercrop/*.png' --imgs2 'generate_images/*.png' -``` diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/calculate_psnr_ssim.py b/ppdiffusers/examples/autoencoder/vae/scripts/calculate_psnr_ssim.py deleted file mode 100644 index ebfb3ff1df67..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/scripts/calculate_psnr_ssim.py +++ /dev/null @@ -1,190 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import glob - -import click -import cv2 -import numpy as np -from PIL import Image -from tqdm import tqdm - - -def reorder_image(img, input_order="HWC"): - """Reorder images to 'HWC' order. - - If the input_order is (h, w), return (h, w, 1); - If the input_order is (c, h, w), return (h, w, c); - If the input_order is (h, w, c), return as it is. - - Args: - img (ndarray): Input image. - input_order (str): Whether the input order is 'HWC' or 'CHW'. - If the input image shape is (h, w), input_order will not have - effects. Default: 'HWC'. - - Returns: - ndarray: reordered image. - """ - - if input_order not in ["HWC", "CHW"]: - raise ValueError(f"Wrong input_order {input_order}. Supported input_orders are " "'HWC' and 'CHW'") - if len(img.shape) == 2: - img = img[..., None] - if input_order == "CHW": - img = img.transpose(1, 2, 0) - return img - - -def calculate_psnr(img, img2, crop_border, input_order="HWC", **kwargs): - """Calculate PSNR (Peak Signal-to-Noise Ratio). - - Ref: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio - - Args: - img (ndarray): Images with range [0, 255]. - img2 (ndarray): Images with range [0, 255]. - crop_border (int): Cropped pixels in each edge of an image. These - pixels are not involved in the PSNR calculation. - input_order (str): Whether the input order is 'HWC' or 'CHW'. - Default: 'HWC'. - test_y_channel (bool): Test on Y channel of YCbCr. Default: False. - - Returns: - float: psnr result. - """ - - assert img.shape == img2.shape, f"Image shapes are different: {img.shape}, {img2.shape}." - if input_order not in ["HWC", "CHW"]: - raise ValueError(f"Wrong input_order {input_order}. Supported input_orders are " '"HWC" and "CHW"') - img = reorder_image(img, input_order=input_order) - img2 = reorder_image(img2, input_order=input_order) - img = img.astype(np.float64) - img2 = img2.astype(np.float64) - - if crop_border != 0: - img = img[crop_border:-crop_border, crop_border:-crop_border, ...] - img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] - - mse = np.mean((img - img2) ** 2) - if mse == 0: - return float("inf") - return 20.0 * np.log10(255.0 / np.sqrt(mse)) - - -def _ssim(img, img2): - """Calculate SSIM (structural similarity) for one channel images. - - It is called by func:`calculate_ssim`. - - Args: - img (ndarray): Images with range [0, 255] with order 'HWC'. - img2 (ndarray): Images with range [0, 255] with order 'HWC'. - - Returns: - float: ssim result. 
- """ - - c1 = (0.01 * 255) ** 2 - c2 = (0.03 * 255) ** 2 - - img = img.astype(np.float64) - img2 = img2.astype(np.float64) - kernel = cv2.getGaussianKernel(11, 1.5) - window = np.outer(kernel, kernel.transpose()) - - mu1 = cv2.filter2D(img, -1, window)[5:-5, 5:-5] - mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5] - mu1_sq = mu1**2 - mu2_sq = mu2**2 - mu1_mu2 = mu1 * mu2 - sigma1_sq = cv2.filter2D(img**2, -1, window)[5:-5, 5:-5] - mu1_sq - sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq - sigma12 = cv2.filter2D(img * img2, -1, window)[5:-5, 5:-5] - mu1_mu2 - - ssim_map = ((2 * mu1_mu2 + c1) * (2 * sigma12 + c2)) / ((mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2)) - return ssim_map.mean() - - -def calculate_ssim(img, img2, crop_border, input_order="HWC", **kwargs): - """Calculate SSIM (structural similarity). - - Ref: - Image quality assessment: From error visibility to structural similarity - - The results are the same as that of the official released MATLAB code in - https://ece.uwaterloo.ca/~z70wang/research/ssim/. - - For three-channel images, SSIM is calculated for each channel and then - averaged. - - Args: - img (ndarray): Images with range [0, 255]. - img2 (ndarray): Images with range [0, 255]. - crop_border (int): Cropped pixels in each edge of an image. These - pixels are not involved in the SSIM calculation. - input_order (str): Whether the input order is 'HWC' or 'CHW'. - Default: 'HWC'. - test_y_channel (bool): Test on Y channel of YCbCr. Default: False. - - Returns: - float: ssim result. - """ - - assert img.shape == img2.shape, f"Image shapes are different: {img.shape}, {img2.shape}." - if input_order not in ["HWC", "CHW"]: - raise ValueError(f"Wrong input_order {input_order}. Supported input_orders are " '"HWC" and "CHW"') - img = reorder_image(img, input_order=input_order) - img2 = reorder_image(img2, input_order=input_order) - img = img.astype(np.float64) - img2 = img2.astype(np.float64) - - if crop_border != 0: - img = img[crop_border:-crop_border, crop_border:-crop_border, ...] - img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] - - ssims = [] - for i in range(img.shape[2]): - ssims.append(_ssim(img[..., i], img2[..., i])) - return np.array(ssims).mean() - - -@click.command() -@click.option("--imgs1", type=str) -@click.option("--imgs2", type=str) -def main(imgs1, imgs2): - imgs1 = sorted(glob.glob(imgs1)) - imgs2 = sorted(glob.glob(imgs2)) - assert len(imgs1) == len(imgs2) - - psnr_all, ssim_all = [], [] - for img1, img2 in tqdm(list(zip(imgs1, imgs2))): - img1 = Image.open(img1).convert("RGB") - img2 = Image.open(img2).convert("RGB") - - psnr = calculate_psnr(np.array(img1), np.array(img2), crop_border=4) - ssim = calculate_ssim(np.array(img1), np.array(img2), crop_border=4) - psnr_all.append(psnr) - ssim_all.append(ssim) - - psnr = np.mean(psnr_all) - ssim = np.mean(ssim_all) - - print(f"PSNR: {psnr}") - print(f"SSIM: {ssim}") - - -if __name__ == "__main__": - main() diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/convert_kl_8_to_ppdiffusers.py b/ppdiffusers/examples/autoencoder/vae/scripts/convert_kl_8_to_ppdiffusers.py deleted file mode 100644 index 5dc0c2bd7598..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/scripts/convert_kl_8_to_ppdiffusers.py +++ /dev/null @@ -1,401 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 The HuggingFace Inc. team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os - -import paddle -import torch - -try: - from omegaconf import OmegaConf -except ImportError: - raise ImportError( - "OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`." - ) - -from ppdiffusers import AutoencoderKL - - -def shave_segments(path, n_shave_prefix_segments=1): - """ - Removes segments. Positive values shave the first segments, negative shave the last segments. - """ - if n_shave_prefix_segments >= 0: - return ".".join(path.split(".")[n_shave_prefix_segments:]) - else: - return ".".join(path.split(".")[:n_shave_prefix_segments]) - - -def renew_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item.replace("in_layers.0", "norm1") - new_item = new_item.replace("in_layers.2", "conv1") - - new_item = new_item.replace("out_layers.0", "norm2") - new_item = new_item.replace("out_layers.3", "conv2") - - new_item = new_item.replace("emb_layers.1", "time_emb_proj") - new_item = new_item.replace("skip_connection", "conv_shortcut") - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace("norm.weight", "group_norm.weight") - new_item = new_item.replace("norm.bias", "group_norm.bias") - - new_item = new_item.replace("q.weight", "query.weight") - new_item = new_item.replace("q.bias", "query.bias") - - new_item = new_item.replace("k.weight", "key.weight") - new_item = new_item.replace("k.bias", "key.bias") - - new_item = new_item.replace("v.weight", "value.weight") - new_item = new_item.replace("v.bias", "value.bias") - - new_item = new_item.replace("proj_out.weight", "proj_attn.weight") - new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - 
mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def assign_to_checkpoint( - paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None -): - """ - This does the final conversion step: take locally converted weights and apply a global renaming - to them. It splits attention layers, and takes into account additional replacements - that may arise. - Assigns the weights to the new checkpoint. - """ - assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." - - # Splits the attention layers into three variables. - if attention_paths_to_split is not None: - for path, path_map in attention_paths_to_split.items(): - old_tensor = old_checkpoint[path] - channels = old_tensor.shape[0] // 3 - - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) - - num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - - old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) - query, key, value = old_tensor.split(channels // num_heads, dim=1) - - checkpoint[path_map["query"]] = query.reshape(target_shape) - checkpoint[path_map["key"]] = key.reshape(target_shape) - checkpoint[path_map["value"]] = value.reshape(target_shape) - - for path in paths: - new_path = path["new"] - - # These have already been assigned - if attention_paths_to_split is not None and new_path in attention_paths_to_split: - continue - - # Global renaming happens here - new_path = new_path.replace("middle_block.0", "mid_block.resnets.0") - new_path = new_path.replace("middle_block.1", "mid_block.attentions.0") - new_path = new_path.replace("middle_block.2", "mid_block.resnets.1") - - if additional_replacements is not None: - for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], replacement["new"]) - - # proj_attn.weight has to be converted from conv 1D to linear - if "proj_attn.weight" in new_path: - checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0] - else: - checkpoint[new_path] = old_checkpoint[path["old"]] - - -def conv_attn_to_linear(checkpoint): - keys = list(checkpoint.keys()) - attn_keys = ["query.weight", "key.weight", "value.weight"] - for key in keys: - if ".".join(key.split(".")[-2:]) in attn_keys: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0, 0] - elif "proj_attn.weight" in key: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0] - - -def create_vae_diffusers_config(original_config): - """ - Creates a config for the diffusers based on the config of the LDM model. 
- """ - encoder_vae_params = original_config.model.params.ddconfig.encoder - decoder_vae_params = original_config.model.params.ddconfig.decoder - vae_params = decoder_vae_params - - encoder_block_out_channels = [encoder_vae_params.ch * mult for mult in encoder_vae_params.ch_mult] - down_block_types = ["DownEncoderBlock2D"] * len(encoder_block_out_channels) - - decoder_block_out_channels = [decoder_vae_params.ch * mult for mult in decoder_vae_params.ch_mult] - up_block_types = ["UpDecoderBlock2D"] * len(decoder_block_out_channels) - - config = dict( - sample_size=512, # vae_params.resolution, - in_channels=vae_params.in_channels, - out_channels=vae_params.out_ch, - down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), - block_out_channels=None, - down_block_out_channels=tuple(encoder_block_out_channels), - up_block_out_channels=tuple(decoder_block_out_channels), - latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, - ) - return config - - -def convert_ldm_vae_checkpoint(vae_state_dict, config): - new_checkpoint = {} - - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] - - new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] - new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] - - # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) - down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) - } - - # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) - up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) - } - - for i in range(num_down_blocks): - resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] - - if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight" - ) - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias" - ) - - 
paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - - for i in range(num_up_blocks): - block_id = num_up_blocks - 1 - i - resnets = [ - key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key - ] - - if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight" - ] - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias" - ] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - return new_checkpoint - - -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"): - need_transpose = [] - for k, v in vae_or_unet.named_sublayers(include_self=True): - if isinstance(v, paddle.nn.Linear): - need_transpose.append(k + ".weight") - new_vae_or_unet = {} - for k, v in diffusers_vae_unet_checkpoint.items(): - if k not in need_transpose: - new_vae_or_unet[k] = v.numpy().astype(dtype) - else: - new_vae_or_unet[k] = v.t().numpy().astype(dtype) - return new_vae_or_unet - - -def check_keys(model, state_dict): - cls_name = model.__class__.__name__ - missing_keys = [] - mismatched_keys = [] - for k, v in model.state_dict().items(): - if k not in state_dict.keys(): - missing_keys.append(k) - if list(v.shape) != list(state_dict[k].shape): - mismatched_keys.append(k) - if len(missing_keys): - missing_keys_str = ", ".join(missing_keys) - 
print(f"{cls_name} Found missing_keys {missing_keys_str}!") - if len(mismatched_keys): - mismatched_keys_str = ", ".join(mismatched_keys) - print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." - ) - parser.add_argument( - "--original_config_file", - default="../config/f8encoder_f16decoder.yaml", - type=str, - help="The YAML config file corresponding to the original architecture.", - ) - parser.add_argument( - "--dtype", - default="float32", - type=str, - help="Dtype of model weights.", - ) - parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") - - args = parser.parse_args() - - original_config = OmegaConf.load(args.original_config_file) - checkpoint = torch.load(args.checkpoint_path, map_location="cpu") - checkpoint = checkpoint.get("state_dict", checkpoint) - vae_config = create_vae_diffusers_config(original_config) - - # 1. convert vae encoder and decoder - diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) - vae = AutoencoderKL.from_config(vae_config) - ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint, args.dtype) - - # 2. convert losses - maps = { - "running_mean": "_mean", - "running_var": "_variance", - } - for k, v in checkpoint.items(): - # ignore num_batches_tracked - if "num_batches_tracked" in k: - print(v) - continue - # only convert loss. prefix - if "loss." in k: - for old, new in maps.items(): - k = k.replace(old, new) - # paddle donot support 0d tensor - if v.ndim == 0: - v = v.reshape((1,)) - # rename - if "perceptual_loss.lin" in k: - k = k.replace("perceptual_loss.lin", "perceptual_loss.lins.") - ppdiffusers_vae_checkpoint[k] = v.numpy().astype(args.dtype) - - # 3. check keys - check_keys(vae, ppdiffusers_vae_checkpoint) - vae.save_config(args.dump_path) - # 4. save state_dict - paddle.save(ppdiffusers_vae_checkpoint, os.path.join(args.dump_path, "model_state.pdparams")) diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/fid_score.py b/ppdiffusers/examples/autoencoder/vae/scripts/fid_score.py deleted file mode 100644 index 5ea658adf9b2..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/scripts/fid_score.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) mseitzer Author. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Calculates the Frechet Inception Distance (FID) to evalulate GANs - -The FID metric calculates the distance between two distributions of images. -Typically, we have summary statistics (mean & covariance matrix) of one -of these distributions, while the 2nd distribution is given by a GAN. 
- -When run as a stand-alone program, it compares the distribution of -images that are stored as PNG/JPEG at a specified location with a -distribution given by summary statistics (in pickle format). - -The FID is calculated by assuming that X_1 and X_2 are the activations of -the pool_3 layer of the inception net for generated samples and real world -samples respectively. - -See --help to see further details. - -Code apapted from https://github.com/bioinf-jku/TTUR to use PyTorch instead -of Tensorflow - -Copyright 2018 Institute of Bioinformatics, JKU Linz - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import os -import pathlib -from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser - -import numpy as np -import paddle -import paddle.vision.transforms as TF -from paddle.nn.functional import adaptive_avg_pool2d -from PIL import Image -from scipy import linalg - -try: - from tqdm import tqdm -except ImportError: - # If tqdm is not available, provide a mock version of it - def tqdm(x): - return x - - -from inception import InceptionV3 - -parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) -parser.add_argument("--batch-size", type=int, default=50, help="Batch size to use") -parser.add_argument( - "--num-workers", type=int, help=("Number of processes to use for data loading. " "Defaults to `min(8, num_cpus)`") -) -parser.add_argument("--device", type=str, default=None, help="Device to use. Like gpu, gpu:0 or cpu") -parser.add_argument( - "--dims", - type=int, - default=2048, - choices=list(InceptionV3.BLOCK_INDEX_BY_DIM), - help=("Dimensionality of Inception features to use. " "By default, uses pool3 features"), -) -parser.add_argument("path", type=str, nargs=2, help=("Paths to the generated images or " "to .npz statistic files")) - -IMAGE_EXTENSIONS = {"bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp"} - - -class ImagePathDataset(paddle.io.Dataset): - def __init__(self, files, transforms=None): - self.files = files - self.transforms = transforms - - def __len__(self): - return len(self.files) - - def __getitem__(self, i): - path = self.files[i] - img = Image.open(path).convert("RGB") - if self.transforms is not None: - img = self.transforms(img) - return {"img": img} - - -def get_activations(files, model, batch_size=50, dims=2048, num_workers=1): - """Calculates the activations of the pool_3 layer for all images. - - Params: - -- files : List of image files paths - -- model : Instance of inception model - -- batch_size : Batch size of images for the model to process at once. - Make sure that the number of samples is a multiple of - the batch size, otherwise some samples are ignored. This - behavior is retained to match the original FID score - implementation. - -- dims : Dimensionality of features returned by Inception - -- num_workers : Number of parallel dataloader workers - - Returns: - -- A numpy array of dimension (num images, dims) that contains the - activations of the given tensor when feeding inception with the - query tensor. 
- """ - model.eval() - - if batch_size > len(files): - print(("Warning: batch size is bigger than the data size. " "Setting batch size to data size")) - batch_size = len(files) - - dataset = ImagePathDataset(files, transforms=TF.ToTensor()) - dataloader = paddle.io.DataLoader( - dataset, batch_size=batch_size, shuffle=False, drop_last=False, num_workers=num_workers - ) - - pred_arr = np.empty((len(files), dims)) - - start_idx = 0 - - for batch in tqdm(dataloader): - batch = batch["img"] - with paddle.no_grad(): - pred = model(batch)[0] - - # If model output is not scalar, apply global spatial average pooling. - # This happens if you choose a dimensionality not equal 2048. - # import pdb;pdb.set_trace() - if pred.shape[2] != 1 or pred.shape[3] != 1: - pred = adaptive_avg_pool2d(pred, output_size=(1, 1)) - - pred = pred.squeeze(3).squeeze(2).cpu().numpy() - - pred_arr[start_idx : start_idx + pred.shape[0]] = pred - - start_idx = start_idx + pred.shape[0] - - return pred_arr - - -def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): - """Numpy implementation of the Frechet Distance. - The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1) - and X_2 ~ N(mu_2, C_2) is - d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)). - - Stable version by Dougal J. Sutherland. - - Params: - -- mu1 : Numpy array containing the activations of a layer of the - inception net (like returned by the function 'get_predictions') - for generated samples. - -- mu2 : The sample mean over activations, precalculated on an - representative data set. - -- sigma1: The covariance matrix over activations for generated samples. - -- sigma2: The covariance matrix over activations, precalculated on an - representative data set. - - Returns: - -- : The Frechet Distance. - """ - - mu1 = np.atleast_1d(mu1) - mu2 = np.atleast_1d(mu2) - - sigma1 = np.atleast_2d(sigma1) - sigma2 = np.atleast_2d(sigma2) - - assert mu1.shape == mu2.shape, "Training and test mean vectors have different lengths" - assert sigma1.shape == sigma2.shape, "Training and test covariances have different dimensions" - - diff = mu1 - mu2 - - # Product might be almost singular - covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) - if not np.isfinite(covmean).all(): - msg = ("fid calculation produces singular product; " "adding %s to diagonal of cov estimates") % eps - print(msg) - offset = np.eye(sigma1.shape[0]) * eps - covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) - - # Numerical error might give slight imaginary component - if np.iscomplexobj(covmean): - if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3): - m = np.max(np.abs(covmean.imag)) - raise ValueError("Imaginary component {}".format(m)) - covmean = covmean.real - - tr_covmean = np.trace(covmean) - - return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean - - -def calculate_activation_statistics(files, model, batch_size=50, dims=2048, num_workers=1): - """Calculation of the statistics used by the FID. - Params: - -- files : List of image files paths - -- model : Instance of inception model - -- batch_size : The images numpy array is split into batches with - batch size batch_size. A reasonable batch size - depends on the hardware. - -- dims : Dimensionality of features returned by Inception - -- num_workers : Number of parallel dataloader workers - - Returns: - -- mu : The mean over samples of the activations of the pool_3 layer of - the inception model. 
- -- sigma : The covariance matrix of the activations of the pool_3 layer of - the inception model. - """ - act = get_activations(files, model, batch_size, dims, num_workers) - mu = np.mean(act, axis=0) - sigma = np.cov(act, rowvar=False) - return mu, sigma - - -def compute_statistics_of_path(path, model, batch_size, dims, num_workers=1): - if path.endswith(".npz"): - with np.load(path) as f: - m, s = f["mu"][:], f["sigma"][:] - else: - path = pathlib.Path(path) - files = sorted([file for ext in IMAGE_EXTENSIONS for file in path.glob("*.{}".format(ext))]) - m, s = calculate_activation_statistics(files, model, batch_size, dims, num_workers) - - return m, s - - -def calculate_fid_given_paths(paths, batch_size, dims, num_workers=1): - """Calculates the FID of two paths""" - for p in paths: - if not os.path.exists(p): - raise RuntimeError("Invalid path: %s" % p) - - block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims] - - model = InceptionV3([block_idx]) - - m1, s1 = compute_statistics_of_path(paths[0], model, batch_size, dims, num_workers) - m2, s2 = compute_statistics_of_path(paths[1], model, batch_size, dims, num_workers) - fid_value = calculate_frechet_distance(m1, s1, m2, s2) - - return fid_value - - -def main(): - args = parser.parse_args() - if args.device is not None: - paddle.set_device(args.device) - - if args.num_workers is None: - num_avail_cpus = len(os.sched_getaffinity(0)) - num_workers = min(num_avail_cpus, 8) - else: - num_workers = args.num_workers - - fid_value = calculate_fid_given_paths(args.path, args.batch_size, args.dims, num_workers) - print("FID: ", fid_value) - - -if __name__ == "__main__": - main() diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/get_autoencoder_results.py b/ppdiffusers/examples/autoencoder/vae/scripts/get_autoencoder_results.py deleted file mode 100644 index 7e5eadaf365b..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/scripts/get_autoencoder_results.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
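The Frechet distance used in fid_score.py above has an easy-to-verify closed form on toy statistics: identical Gaussians give 0, and with identity covariances the distance reduces to the squared mean difference. A tiny check, assuming `calculate_frechet_distance` from fid_score.py is in scope (this snippet is not part of the original script):

```python
# Toy verification of calculate_frechet_distance (illustrative, not from the original file).
import numpy as np

mu1, sigma1 = np.zeros(4), np.eye(4)
mu2, sigma2 = np.full(4, 0.5), np.eye(4)

print(calculate_frechet_distance(mu1, sigma1, mu1, sigma1))  # ~0.0 for identical statistics
print(calculate_frechet_distance(mu1, sigma1, mu2, sigma2))  # ||mu1 - mu2||^2 = 4 * 0.25 = 1.0
```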
- -import glob -import os -import os.path as osp - -import click -import paddle -from paddle.vision import transforms -from PIL import Image -from tqdm import tqdm - -from ppdiffusers import AutoencoderKL, StableDiffusionImg2ImgPipeline - -image_processing = transforms.Compose( - [ - transforms.ToTensor(), - transforms.Normalize(0.5, 0.5), - ] -) - - -def decode_image(image): - image = (image / 2 + 0.5).clip(0, 1).transpose([0, 2, 3, 1]).cast("float32").numpy() - image = StableDiffusionImg2ImgPipeline.numpy_to_pil(image) - return image - - -@click.command() -@click.option("--vae_path", type=str) -@click.option("--src_size", type=int) -@click.option("--tgt_size", type=int) -@click.option("--imgs", type=str) -@click.option("--outdir", type=str) -def main(vae_path, src_size, tgt_size, imgs, outdir): - imgs = sorted(glob.glob(imgs)) - model = AutoencoderKL.from_pretrained(vae_path) - model.eval() - with paddle.no_grad(): - os.makedirs(outdir, exist_ok=True) - for img_path in tqdm(imgs): - img = Image.open(img_path).convert("RGB") - w, h = img.size - - assert w == tgt_size and h == tgt_size - img = img.resize([src_size, src_size]) - - img = image_processing(img).unsqueeze(0) - - z = model.encode(img).latent_dist.sample() - recon = model.decode(z).sample - - decode_image(recon)[0].save(osp.join(outdir, osp.basename(img_path))) - - -if __name__ == "__main__": - main() diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/inception.py b/ppdiffusers/examples/autoencoder/vae/scripts/inception.py deleted file mode 100644 index 9a024ff75803..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/scripts/inception.py +++ /dev/null @@ -1,493 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) mseitzer Author. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.utils.download import get_weights_path_from_url - -# Inception weights ported to Pytorch from -# http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz -FID_WEIGHTS_URL = ( - "https://paddlenlp.bj.bcebos.com/models/mseitzer/pp_inception-2015-12-05-6726825d.pdparams", - "8e2ae24c34c5c8b81d45167bb9361f4c", -) -WEIGHTS_PATH = "pp_inception-2015-12-05-6726825d.pdparams" - - -class ConvNormActivation(nn.Sequential): - """ - Configurable block used for Convolution-Normalzation-Activation blocks. - This code is based on the torchvision code with modifications. - You can also see at https://github.com/pytorch/vision/blob/main/torchvision/ops/misc.py#L68 - Args: - in_channels (int): Number of channels in the input image - out_channels (int): Number of channels produced by the Convolution-Normalzation-Activation block - kernel_size: (int|list|tuple, optional): Size of the convolving kernel. Default: 3 - stride (int|list|tuple, optional): Stride of the convolution. Default: 1 - padding (int|str|tuple|list, optional): Padding added to all four sides of the input. 
Default: None, - in wich case it will calculated as ``padding = (kernel_size - 1) // 2 * dilation`` - groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 - norm_layer (Callable[..., paddle.nn.Layer], optional): Norm layer that will be stacked on top of the convolutiuon layer. - If ``None`` this layer wont be used. Default: ``paddle.nn.BatchNorm2D`` - activation_layer (Callable[..., paddle.nn.Layer], optional): Activation function which will be stacked on top of the normalization - layer (if not ``None``), otherwise on top of the conv layer. If ``None`` this layer wont be used. Default: ``paddle.nn.ReLU`` - dilation (int): Spacing between kernel elements. Default: 1 - bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``. - """ - - def __init__( - self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - padding=None, - groups=1, - norm_layer=nn.BatchNorm2D, - activation_layer=nn.ReLU, - dilation=1, - bias=None, - ): - if padding is None: - padding = (kernel_size - 1) // 2 * dilation - if bias is None: - bias = norm_layer is None - layers = [ - nn.Conv2D( - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation=dilation, - groups=groups, - bias_attr=bias, - ) - ] - if norm_layer is not None: - # The hyperparameter of BatchNorm2D is different from PaddlePaddle. - layers.append(norm_layer(out_channels, momentum=0.1, epsilon=0.001)) - if activation_layer is not None: - layers.append(activation_layer()) - super().__init__(*layers) - - -class InceptionV3(nn.Layer): - """Pretrained InceptionV3 network returning feature maps""" - - # Index of default block of inception to return, - # corresponds to output of final average pooling - DEFAULT_BLOCK_INDEX = 3 - - # Maps feature dimensionality to their output blocks indices - BLOCK_INDEX_BY_DIM = { - 64: 0, # First max pooling features - 192: 1, # Second max pooling featurs - 768: 2, # Pre-aux classifier features - 2048: 3, # Final average pooling features - } - - def __init__( - self, - output_blocks=(DEFAULT_BLOCK_INDEX,), - resize_input=True, - normalize_input=True, - requires_grad=False, - use_fid_inception=True, - ): - """Build pretrained InceptionV3 - - Parameters - ---------- - output_blocks : list of int - Indices of blocks to return features of. Possible values are: - - 0: corresponds to output of first max pooling - - 1: corresponds to output of second max pooling - - 2: corresponds to output which is fed to aux classifier - - 3: corresponds to output of final average pooling - resize_input : bool - If true, bilinearly resizes input to width and height 299 before - feeding input to model. As the network without fully connected - layers is fully convolutional, it should be able to handle inputs - of arbitrary size, so resizing might not be strictly needed - normalize_input : bool - If true, scales the input from range (0, 1) to the range the - pretrained Inception network expects, namely (-1, 1) - requires_grad : bool - If true, parameters of the model require gradients. Possibly useful - for finetuning the network - use_fid_inception : bool - If true, uses the pretrained Inception model used in Tensorflow's - FID implementation. If false, uses the pretrained Inception model - available in paddle.vision. The FID Inception model has different - weights and a slightly different structure from paddle.vision's - Inception model. 
If you want to compute FID scores, you are - strongly advised to set this parameter to true to get comparable - results. - """ - super(InceptionV3, self).__init__() - - self.resize_input = resize_input - self.normalize_input = normalize_input - self.output_blocks = sorted(output_blocks) - self.last_needed_block = max(output_blocks) - - assert self.last_needed_block <= 3, "Last possible output block index is 3" - - self.blocks = nn.LayerList() - - if use_fid_inception: - inception = fid_inception_v3() - else: - inception = _inception_v3(pretrained=True) - - # Block 0: input to maxpool1 - block0 = [ - inception.inception_stem.conv_1a_3x3, - inception.inception_stem.conv_2a_3x3, - inception.inception_stem.conv_2b_3x3, - inception.inception_stem.max_pool, - ] - self.blocks.append(nn.Sequential(*block0)) - - # Block 1: maxpool1 to maxpool2 - if self.last_needed_block >= 1: - block1 = [ - inception.inception_stem.conv_3b_1x1, - inception.inception_stem.conv_4a_3x3, - inception.inception_stem.max_pool, - ] - self.blocks.append(nn.Sequential(*block1)) - - # Block 2: maxpool2 to aux classifier - if self.last_needed_block >= 2: - block2 = [ - inception.inception_block_list[0], - inception.inception_block_list[1], - inception.inception_block_list[2], - inception.inception_block_list[3], - inception.inception_block_list[4], - inception.inception_block_list[5], - inception.inception_block_list[6], - inception.inception_block_list[7], - ] - self.blocks.append(nn.Sequential(*block2)) - - # Block 3: aux classifier to final avgpool - if self.last_needed_block >= 3: - block3 = [ - inception.inception_block_list[8], - inception.inception_block_list[9], - inception.inception_block_list[10], - inception.avg_pool, - ] - self.blocks.append(nn.Sequential(*block3)) - - for param in self.parameters(): - param.stop_gradient = requires_grad - - def forward(self, inp): - """Get Inception feature maps - - Parameters - ---------- - inp : paddle.Tensor - Input tensor of shape Bx3xHxW. Values are expected to be in - range (0, 1) - - Returns - ------- - List of paddle.Tensor, corresponding to the selected output - block, sorted ascending by index - """ - outp = [] - x = inp - if self.resize_input: - x = F.interpolate(x, size=(299, 299), mode="bilinear", align_corners=False) - - if self.normalize_input: - x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1) - for idx, block in enumerate(self.blocks): - x = block(x) - if idx in self.output_blocks: - outp.append(x) - - if idx == self.last_needed_block: - break - - return outp - - -def hack_bn_layer(layer): - if isinstance(layer, nn.BatchNorm2D): - layer._momentum = 0.1 - layer._epsilon = 0.001 - - -def _inception_v3(*args, **kwargs): - """Wraps `paddle.vision.models.inception_v3`""" - return paddle.vision.models.inception_v3(*args, **kwargs).apply(hack_bn_layer) - - -def fid_inception_v3(): - """Build pretrained Inception model for FID computation - - The Inception model for FID computation uses a different set of weights - and has a slightly different structure than paddle.vision's Inception. - - This method first constructs paddle.vision's Inception and then patches the - necessary parts that are different in the FID Inception model. 
- """ - inception = _inception_v3(num_classes=1008, with_pool=True, pretrained=False) - inception.inception_block_list[0] = InceptionA(192, pool_features=32) - inception.inception_block_list[1] = InceptionA(256, pool_features=64) - inception.inception_block_list[2] = InceptionA(288, pool_features=64) - inception.inception_block_list[4] = InceptionC(768, channels_7x7=128) - inception.inception_block_list[5] = InceptionC(768, channels_7x7=160) - inception.inception_block_list[6] = InceptionC(768, channels_7x7=160) - inception.inception_block_list[7] = InceptionC(768, channels_7x7=192) - inception.inception_block_list[9] = InceptionE_1(1280) - inception.inception_block_list[10] = InceptionE_2(2048) - - weight_path = get_weights_path_from_url(FID_WEIGHTS_URL[0], FID_WEIGHTS_URL[1]) - state_dict = paddle.load(weight_path) - inception.set_state_dict(state_dict) - return inception - - -class InceptionA(nn.Layer): - def __init__(self, num_channels, pool_features): - super().__init__() - self.branch1x1 = ConvNormActivation( - in_channels=num_channels, out_channels=64, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - - self.branch5x5_1 = ConvNormActivation( - in_channels=num_channels, out_channels=48, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - self.branch5x5_2 = ConvNormActivation( - in_channels=48, out_channels=64, kernel_size=5, padding=2, activation_layer=nn.ReLU - ) - - self.branch3x3dbl_1 = ConvNormActivation( - in_channels=num_channels, out_channels=64, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - self.branch3x3dbl_2 = ConvNormActivation( - in_channels=64, out_channels=96, kernel_size=3, padding=1, activation_layer=nn.ReLU - ) - self.branch3x3dbl_3 = ConvNormActivation( - in_channels=96, out_channels=96, kernel_size=3, padding=1, activation_layer=nn.ReLU - ) - # Patch: Tensorflow's average pool does not use the padded zero's in - # its average calculation - self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True) - self.branch_pool_conv = ConvNormActivation( - in_channels=num_channels, out_channels=pool_features, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - - def forward(self, x): - branch1x1 = self.branch1x1(x) - branch5x5 = self.branch5x5_1(x) - branch5x5 = self.branch5x5_2(branch5x5) - - branch3x3dbl = self.branch3x3dbl_1(x) - branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) - branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) - - branch_pool = self.branch_pool(x) - branch_pool = self.branch_pool_conv(branch_pool) - x = paddle.concat([branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1) - return x - - -class InceptionC(nn.Layer): - def __init__(self, num_channels, channels_7x7): - super().__init__() - self.branch1x1 = ConvNormActivation( - in_channels=num_channels, out_channels=192, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - - self.branch7x7_1 = ConvNormActivation( - in_channels=num_channels, - out_channels=channels_7x7, - kernel_size=1, - stride=1, - padding=0, - activation_layer=nn.ReLU, - ) - self.branch7x7_2 = ConvNormActivation( - in_channels=channels_7x7, - out_channels=channels_7x7, - kernel_size=(1, 7), - stride=1, - padding=(0, 3), - activation_layer=nn.ReLU, - ) - self.branch7x7_3 = ConvNormActivation( - in_channels=channels_7x7, - out_channels=192, - kernel_size=(7, 1), - stride=1, - padding=(3, 0), - activation_layer=nn.ReLU, - ) - - self.branch7x7dbl_1 = ConvNormActivation( - in_channels=num_channels, out_channels=channels_7x7, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - 
self.branch7x7dbl_2 = ConvNormActivation( - in_channels=channels_7x7, - out_channels=channels_7x7, - kernel_size=(7, 1), - padding=(3, 0), - activation_layer=nn.ReLU, - ) - self.branch7x7dbl_3 = ConvNormActivation( - in_channels=channels_7x7, - out_channels=channels_7x7, - kernel_size=(1, 7), - padding=(0, 3), - activation_layer=nn.ReLU, - ) - self.branch7x7dbl_4 = ConvNormActivation( - in_channels=channels_7x7, - out_channels=channels_7x7, - kernel_size=(7, 1), - padding=(3, 0), - activation_layer=nn.ReLU, - ) - self.branch7x7dbl_5 = ConvNormActivation( - in_channels=channels_7x7, out_channels=192, kernel_size=(1, 7), padding=(0, 3), activation_layer=nn.ReLU - ) - # Patch: Tensorflow's average pool does not use the padded zero's in - # its average calculation - self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True) - self.branch_pool_conv = ConvNormActivation( - in_channels=num_channels, out_channels=192, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - - def forward(self, x): - branch1x1 = self.branch1x1(x) - - branch7x7 = self.branch7x7_1(x) - branch7x7 = self.branch7x7_2(branch7x7) - branch7x7 = self.branch7x7_3(branch7x7) - - branch7x7dbl = self.branch7x7dbl_1(x) - branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) - branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) - branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) - branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) - - branch_pool = self.branch_pool(x) - branch_pool = self.branch_pool_conv(branch_pool) - - x = paddle.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1) - - return x - - -class InceptionE_1(nn.Layer): - def __init__(self, num_channels): - super().__init__() - self.branch1x1 = ConvNormActivation( - in_channels=num_channels, out_channels=320, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - self.branch3x3_1 = ConvNormActivation( - in_channels=num_channels, out_channels=384, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - self.branch3x3_2a = ConvNormActivation( - in_channels=384, out_channels=384, kernel_size=(1, 3), padding=(0, 1), activation_layer=nn.ReLU - ) - self.branch3x3_2b = ConvNormActivation( - in_channels=384, out_channels=384, kernel_size=(3, 1), padding=(1, 0), activation_layer=nn.ReLU - ) - - self.branch3x3dbl_1 = ConvNormActivation( - in_channels=num_channels, out_channels=448, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - self.branch3x3dbl_2 = ConvNormActivation( - in_channels=448, out_channels=384, kernel_size=3, padding=1, activation_layer=nn.ReLU - ) - self.branch3x3dbl_3a = ConvNormActivation( - in_channels=384, out_channels=384, kernel_size=(1, 3), padding=(0, 1), activation_layer=nn.ReLU - ) - self.branch3x3dbl_3b = ConvNormActivation( - in_channels=384, out_channels=384, kernel_size=(3, 1), padding=(1, 0), activation_layer=nn.ReLU - ) - - # Patch: Tensorflow's average pool does not use the padded zero's in - # its average calculation - self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True) - self.branch_pool_conv = ConvNormActivation( - in_channels=num_channels, out_channels=192, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - - def forward(self, x): - branch1x1 = self.branch1x1(x) - - branch3x3 = self.branch3x3_1(x) - branch3x3 = [ - self.branch3x3_2a(branch3x3), - self.branch3x3_2b(branch3x3), - ] - branch3x3 = paddle.concat(branch3x3, axis=1) - - branch3x3dbl = self.branch3x3dbl_1(x) - branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) - branch3x3dbl = [ - 
self.branch3x3dbl_3a(branch3x3dbl), - self.branch3x3dbl_3b(branch3x3dbl), - ] - branch3x3dbl = paddle.concat(branch3x3dbl, axis=1) - - branch_pool = self.branch_pool(x) - branch_pool = self.branch_pool_conv(branch_pool) - - x = paddle.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) - return x - - -class InceptionE_2(InceptionE_1): - def __init__(self, num_channels): - super(InceptionE_2, self).__init__(num_channels) - - def forward(self, x): - branch1x1 = self.branch1x1(x) - - branch3x3 = self.branch3x3_1(x) - branch3x3 = [ - self.branch3x3_2a(branch3x3), - self.branch3x3_2b(branch3x3), - ] - branch3x3 = paddle.concat(branch3x3, axis=1) - - branch3x3dbl = self.branch3x3dbl_1(x) - branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) - branch3x3dbl = [ - self.branch3x3dbl_3a(branch3x3dbl), - self.branch3x3dbl_3b(branch3x3dbl), - ] - branch3x3dbl = paddle.concat(branch3x3dbl, axis=1) - - # Patch: The FID Inception model uses max pooling instead of average - # pooling. This is likely an error in this specific Inception - # implementation, as other Inception models use average pooling here - # (which matches the description in the paper). - branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1) - branch_pool = self.branch_pool_conv(branch_pool) - - x = paddle.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) - return x diff --git a/ppdiffusers/examples/autoencoder/vae/train_vae.py b/ppdiffusers/examples/autoencoder/vae/train_vae.py deleted file mode 100644 index e778b0eece78..000000000000 --- a/ppdiffusers/examples/autoencoder/vae/train_vae.py +++ /dev/null @@ -1,487 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
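The InceptionV3 wrapper in inception.py above exposes intermediate feature maps by block index, and BLOCK_INDEX_BY_DIM maps a feature dimensionality to the block that produces it (2048 corresponds to the final average pooling). A minimal usage sketch follows, assuming the class above is importable and that inputs are float32 images in [0, 1] with shape [N, 3, H, W]; the module name and the random placeholder batch are assumptions.

```python
# Hedged usage sketch: extracting 2048-d pool_3 features for FID with the
# InceptionV3 wrapper defined above.  Constructing it downloads the ported
# FID weights on first use.
import paddle

from inception import InceptionV3  # module name assumed

dims = 2048                                       # final average-pooling features
block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]  # -> 3
model = InceptionV3([block_idx])
model.eval()

images = paddle.rand([4, 3, 299, 299])            # placeholder batch in [0, 1]
with paddle.no_grad():
    features = model(images)[0]                   # last (and only) requested block
    features = features.reshape([features.shape[0], -1])  # [N, 2048]
```

Because resize_input and normalize_input default to True, the wrapper resizes inputs to 299x299 and rescales them from (0, 1) to (-1, 1) internally, so arbitrary input sizes are accepted.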
- -import argparse -import json -import math -import os -from collections import defaultdict - -import numpy as np -import paddle -from ldm import AutoencoderKLWithLoss, TextImagePair, worker_init_fn -from paddle.io import BatchSampler, DataLoader, DistributedBatchSampler -from paddle.optimizer import Adam -from tqdm.auto import tqdm - -from paddlenlp.trainer import set_seed -from paddlenlp.utils.log import logger -from ppdiffusers.models.ema import LitEma -from ppdiffusers.training_utils import freeze_params, main_process_first, unwrap_model - - -def read_json(file): - with open(file, "r", encoding="utf-8") as f: - data = json.load(f) - return data - - -def get_writer(args): - if args.report_to == "visualdl": - from visualdl import LogWriter - - writer = LogWriter(logdir=args.logging_dir) - elif args.report_to == "tensorboard": - from tensorboardX import SummaryWriter - - writer = SummaryWriter(logdir=args.logging_dir) - else: - raise ValueError("report_to must be in ['visualdl', 'tensorboard']") - return writer - - -def run_evaluate(vae, val_dataloader, writer, global_step): - log_dict_ae_all = defaultdict(list) - log_dict_disc_all = defaultdict(list) - for batch in val_dataloader: - log_dict_ae, log_dict_disc = unwrap_model(vae).validation_step(batch["image"], global_step=global_step) - for k, v in log_dict_ae.items(): - if "loss" not in k: - continue - log_dict_ae_all[k].append(v) - for k, v in log_dict_disc.items(): - log_dict_disc_all[k].append(v) - for name, val in log_dict_ae_all.items(): - writer.add_scalar(name, np.mean(val), global_step) - for name, val in log_dict_disc_all.items(): - writer.add_scalar(name, np.mean(val), global_step) - - -def parse_args(): - parser = argparse.ArgumentParser(description="Simple example of a training a autoencoder model script.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default=None, - required=False, - help="Path to pretrained model or model identifier from bos.", - ) - parser.add_argument( - "--output_dir", - type=str, - default="autoencoder_outputs", - help="The output directory where the model predictions and checkpoints will be written.", - ) - parser.add_argument("--seed", type=int, default=23, help="A seed for reproducible training.") - parser.add_argument( - "--batch_size", - type=int, - default=4, - help="Batch size (per device) for the training/validation dataloader.", - ) - parser.add_argument("--num_train_epochs", type=int, default=100) - parser.add_argument( - "--max_train_steps", - type=int, - default=None, - help="Total number of training steps to perform. If provided, overrides num_train_epochs.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=4.5e-06, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument( - "--scale_lr", - action="store_true", - help="Scale base-lr by ngpu * batch_size", - ) - parser.add_argument("--freeze_encoder", action="store_true", help="Whether to freeze encoder layer.") - parser.add_argument( - "--from_scratch", - action="store_true", - help="Whether to train new model from scratch. ", - ) - parser.add_argument("--vae_config_file", default=None, type=str, help="Path to the vae_config_file.") - parser.add_argument( - "--logging_dir", - type=str, - default="logs", - help=( - "[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. 
Will default to" - "*output_dir/logs" - ), - ) - parser.add_argument( - "--report_to", - type=str, - default="visualdl", - choices=["tensorboard", "visualdl"], - help="Log writer type.", - ) - parser.add_argument("--logging_steps", default=100, type=int, help="The interval steps to logging.") - parser.add_argument( - "--image_logging_steps", - default=500, - type=int, - help="The interval steps to logging images.", - ) - parser.add_argument("--save_steps", default=2000, type=int, help="The interval steps to saveing.") - parser.add_argument( - "--ignore_keys", - default=[], - type=str, - nargs="*", - help="The prefix keys to be ignored when we resume from a pretrained model, e.g. ignore_keys = ['decoder.'], we will ignore 'decoder.xxx', 'decoder.xxx.xxx'.", - ) - parser.add_argument( - "--input_size", default=None, type=int, nargs="*", help="The height and width of the input at the encoder." - ) - # dataset - parser.add_argument( - "--dataset_type", - type=str, - default="text_image_pair", - choices=["imagenet", "text_image_pair"], - help="The type of dataset.", - ) - parser.add_argument( - "--resolution", - type=int, - default=512, - help=( - "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution" - ), - ) - parser.add_argument( - "--degradation", - type=str, - default="pil_nearest", - help="Degradation_fn, e.g. cv_bicubic, bsrgan_light, or pil_nearest", - ) - parser.add_argument( - "--file_list", - type=str, - default="./data/filelist/train.filelist.list", - help="Path to the train file_list.", - ) - parser.add_argument( - "--num_workers", - type=int, - default=8, - help="The number of subprocess to load data.", - ) - parser.add_argument( - "--num_records", - type=int, - default=62500, - help="The num_records of the text_image_pair dataset.", - ) - parser.add_argument( - "--buffer_size", - type=int, - default=100, - help="The buffer size of the text_image_pair dataset.", - ) - parser.add_argument( - "--shuffle_every_n_samples", - type=int, - default=5, - help="The shuffle_every_n_samples of the text_image_pair dataset.", - ) - parser.add_argument("--init_from_ckpt", type=str, default=None, help="The path of checkpoint to be loaded.") - - # loss fn - parser.add_argument("--disc_start", type=int, default=50001, help="The number of steps the discriminator started.") - parser.add_argument("--kl_weight", type=float, default=1.0e-6, help="The weight ratio of the kl_loss.") - parser.add_argument("--disc_weight", type=float, default=0.5, help="The weight ratio of the disc_loss.") - parser.add_argument("--logvar_init", type=float, default=0.0, help="The init value of the output log variances.") - parser.add_argument("--pixelloss_weight", type=float, default=1.0, help="The weight ratio of the pixelloss.") - parser.add_argument("--disc_num_layers", type=int, default=3, help="The num layers of the discriminator.") - parser.add_argument("--disc_in_channels", type=int, default=3, help="The in channels of the discriminator.") - parser.add_argument("--disc_factor", type=float, default=1.0, help="The factor of the discriminator loss.") - parser.add_argument( - "--perceptual_weight", type=float, default=1.0, help="The weight ratio of the perceptual loss." - ) - parser.add_argument( - "--use_actnorm", action="store_true", help="Whether to use actnorm in NLayerDiscriminator layer." 
- ) - parser.add_argument("--disc_conditional", action="store_true", help="Whether to use conditional discriminator.") - parser.add_argument( - "--disc_loss", type=str, choices=["hinge", "vanilla"], default="hinge", help="The type of discriminator loss." - ) - parser.add_argument("--use_ema", action="store_true", help="Whether to use_ema.") - parser.add_argument( - "--enable_xformers_memory_efficient_attention", - action="store_true", - help="Whether to enable_xformers_memory_efficient_attention.", - ) - parser.add_argument("--recompute", action="store_true", help="Whether to recompute.") - parser.add_argument("--ema_decay", type=float, default=0.9999, help="The value of ema_decay.") - args = parser.parse_args() - - args.logging_dir = os.path.join(args.output_dir, args.logging_dir) - args.image_logging_steps = math.ceil(args.image_logging_steps / args.logging_steps) * args.logging_steps - - return args - - -def check_keys(model, state_dict): - cls_name = model.__class__.__name__ - missing_keys = [] - mismatched_keys = [] - for k, v in model.state_dict().items(): - if k not in state_dict.keys(): - missing_keys.append(k) - if list(v.shape) != list(state_dict[k].shape): - mismatched_keys.append(k) - if len(missing_keys): - missing_keys_str = ", ".join(missing_keys) - print(f"{cls_name} Found missing_keys {missing_keys_str}!") - if len(mismatched_keys): - mismatched_keys_str = ", ".join(mismatched_keys) - print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") - if len(missing_keys) == 0 and len(mismatched_keys) == 0: - print(f"{cls_name} All model state_dict are loaded!") - - -def main(): - args = parse_args() - rank = paddle.distributed.get_rank() - num_processes = paddle.distributed.get_world_size() - if num_processes > 1: - paddle.distributed.init_parallel_env() - - # If passed along, set the training seed now. - if args.seed is not None: - set_seed(args.seed) - - if args.output_dir is not None: - os.makedirs(args.output_dir, exist_ok=True) - - if not args.from_scratch: - # Load pretrained model - if args.vae_config_file is not None: - model_kwargs = read_json(args.vae_config_file) - else: - model_kwargs = {} - vae = AutoencoderKLWithLoss.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="vae", - ignore_keys=args.ignore_keys, - input_size=args.input_size, - disc_start=args.disc_start, - kl_weight=args.kl_weight, - disc_weight=args.disc_weight, - logvar_init=args.logvar_init, - pixelloss_weight=args.pixelloss_weight, - disc_num_layers=args.disc_num_layers, - disc_in_channels=args.disc_in_channels, - disc_factor=args.disc_factor, - perceptual_weight=args.perceptual_weight, - use_actnorm=args.use_actnorm, - disc_conditional=args.disc_conditional, - disc_loss=args.disc_loss, - ema_decay=args.ema_decay, - use_ema=args.use_ema, - **model_kwargs, - ) - else: - assert args.vae_config_file is not None, "We must supply vae_config_file!" 
- # Load config: train model from scatch - vae = AutoencoderKLWithLoss.from_config( - read_json(args.vae_config_file), - input_size=args.input_size, - disc_start=args.disc_start, - kl_weight=args.kl_weight, - disc_weight=args.disc_weight, - logvar_init=args.logvar_init, - pixelloss_weight=args.pixelloss_weight, - disc_num_layers=args.disc_num_layers, - disc_in_channels=args.disc_in_channels, - disc_factor=args.disc_factor, - perceptual_weight=args.perceptual_weight, - use_actnorm=args.use_actnorm, - disc_conditional=args.disc_conditional, - disc_loss=args.disc_loss, - ema_decay=args.ema_decay, - use_ema=args.use_ema, - ) - - if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt): - state_dict = paddle.load(args.init_from_ckpt) - vae.set_dict(state_dict) - check_keys(vae, state_dict) - del state_dict - - if args.scale_lr: - args.learning_rate = num_processes * args.batch_size * args.learning_rate - - # configure_optimizers - parameters = list(vae.decoder.parameters()) + list(vae.post_quant_conv.parameters()) - # we may freeze_encoder - if not args.freeze_encoder: - parameters += list(vae.encoder.parameters()) - parameters += list(vae.quant_conv.parameters()) - else: - freeze_params(vae.encoder.parameters()) - freeze_params(vae.quant_conv.parameters()) - print("Freeze vae.encoder.parameters and vae.quant_conv.parameters!") - - opt_ae = Adam(parameters=parameters, learning_rate=args.learning_rate, beta1=0.5, beta2=0.9) - opt_disc = Adam( - parameters=vae.loss.discriminator.parameters(), - learning_rate=args.learning_rate, - beta1=0.5, - beta2=0.9, - ) - if args.use_ema: - vae.model_ema = LitEma(vae, decay=args.ema_decay) - if args.recompute: - vae.enable_gradient_checkpointing() - if args.enable_xformers_memory_efficient_attention: - vae.enable_xformers_memory_efficient_attention() - - optimizers = [opt_ae, opt_disc] - - if num_processes > 1: - vae = paddle.DataParallel(vae, find_unused_parameters=True) - - if args.dataset_type == "imagenet": - from ldm import ImageNetSRTrain, ImageNetSRValidation - - with main_process_first(): - train_dataset = ImageNetSRTrain(size=args.resolution, degradation=args.degradation) - val_dataset = ImageNetSRValidation(size=args.resolution, degradation=args.degradation) - train_sampler = ( - DistributedBatchSampler(train_dataset, batch_size=args.batch_size, shuffle=True) - if num_processes > 1 - else BatchSampler(train_dataset, batch_size=args.batch_size, shuffle=True) - ) - train_dataloader = DataLoader(train_dataset, batch_sampler=train_sampler, num_workers=args.num_workers) - - val_sampler = BatchSampler(val_dataset, batch_size=args.batch_size * 2, shuffle=False) - val_dataloader = DataLoader(val_dataset, batch_sampler=val_sampler, num_workers=args.num_workers) - else: - train_dataset = TextImagePair( - file_list=args.file_list, - size=args.resolution, - num_records=args.num_records, - buffer_size=args.buffer_size, - shuffle_every_n_samples=args.shuffle_every_n_samples, - interpolation="lanczos", - ) - - train_dataloader = DataLoader( - train_dataset, - batch_size=args.batch_size, - num_workers=args.num_workers, - worker_init_fn=worker_init_fn, - ) - val_dataloader = val_dataset = None - # Scheduler and math around the number of training steps. 
- overrode_max_train_steps = False - num_update_steps_per_epoch = ( - len(train_dataloader) if args.dataset_type == "imagenet" else math.ceil(len(train_dataset) / args.batch_size) - ) - if args.max_train_steps is None: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - overrode_max_train_steps = True - - if overrode_max_train_steps: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - - if rank == 0: - logger.info("----------- Configuration Arguments -----------") - for arg, value in sorted(vars(args).items()): - logger.info("%s: %s" % (arg, value)) - logger.info("------------------------------------------------") - writer = get_writer(args) - - # Train! - total_batch_size = args.batch_size * num_processes - - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {args.batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed) = {total_batch_size}") - logger.info(f" Total optimization steps = {args.max_train_steps}") - logger.info( - f" Number of trainable parameters = {sum(p.numel().item() for p in vae.parameters() if not p.stop_gradient) }" - ) - logger.info( - f" Number of non-trainable parameters = {sum(p.numel().item() for p in vae.parameters() if p.stop_gradient) }" - ) - # Only show the progress bar once on each machine. - progress_bar = tqdm(range(args.max_train_steps), disable=rank > 0) - progress_bar.set_description("Steps") - global_step = 0 - - vae.train() - for epoch in range(args.num_train_epochs): - for batch in train_dataloader: - logs = {"epoch": str(epoch).zfill(4)} - for optimizer_idx in [0, 1]: - # pytorch_lightning use this `toggle_optimizer` method - # ref: https://github.com/Lightning-AI/lightning/blob/a58639ce7e864dd70484e7d34c37730ae204183c/src/pytorch_lightning/core/module.py#L1419-L1447 - unwrap_model(vae).toggle_optimizer(optimizers, optimizer_idx) - loss, log_dict = vae(batch["image"], optimizer_idx=optimizer_idx, global_step=global_step) - optimizers[optimizer_idx].clear_grad() - loss.backward() - optimizers[optimizer_idx].step() - # pytorch_lightning use this `untoggle_optimizer` method - # ref: https://github.com/Lightning-AI/lightning/blob/a58639ce7e864dd70484e7d34c37730ae204183c/src/pytorch_lightning/core/module.py#L1449-L1464 - unwrap_model(vae).untoggle_optimizer(optimizers, optimizer_idx) - logs.update(log_dict) - unwrap_model(vae).on_train_batch_end() - progress_bar.update(1) - global_step += 1 - # progress_bar.set_postfix(**logs) - - if rank == 0: - # logging - if global_step % args.logging_steps == 0: - for name, val in logs.items(): - if name == "epoch": - continue - writer.add_scalar(name, val, global_step) - - if global_step % args.image_logging_steps == 0: - images_log = unwrap_model(vae).log_images(batch["image"]) - for name, val in images_log.items(): - writer.add_image(name, val, global_step, dataformats="NHWC") - - # saving - if global_step % args.save_steps == 0: - if val_dataloader is not None: - run_evaluate(unwrap_model(vae), val_dataloader, writer, global_step) - output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) - unwrap_model(vae).save_pretrained(output_dir) - - del logs - if global_step >= args.max_train_steps: - break - - if rank == 0: 
- writer.close() - unwrap_model(vae).save_pretrained(args.output_dir) - - -if __name__ == "__main__": - main() diff --git a/ppdiffusers/examples/clip_interrogator/LICENSE b/ppdiffusers/examples/clip_interrogator/LICENSE deleted file mode 100644 index 701a2623f9d5..000000000000 --- a/ppdiffusers/examples/clip_interrogator/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2022 pharmapsychotic - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/ppdiffusers/examples/clip_interrogator/README.md b/ppdiffusers/examples/clip_interrogator/README.md deleted file mode 100644 index ae9580842a68..000000000000 --- a/ppdiffusers/examples/clip_interrogator/README.md +++ /dev/null @@ -1,31 +0,0 @@ -# clip-interrogator - -## 依赖 -```shell -pip install -r requirements.txt - -``` -## 准备data数据(包含artists.txt、flavors.txt、mediums.txt、movements.txt) -```shell -wget https://paddlenlp.bj.bcebos.com/models/community/Salesforce/blip-image-captioning-large/data.zip -# 将data文件解压至clip_interrogator目录下 -unzip -d clip_interrogator data.zip -``` - -## 使用 -### 快速开始 -```python -from PIL import Image -from clip_interrogator import Config, Interrogator -image = Image.open(image_path).convert('RGB') -ci = Interrogator(Config(clip_pretrained_model_name_or_path="openai/clip-vit-large-patch14")) -print(ci.interrogate(image)) -``` - -### Gradio -```shell -python run_gradio.py \ - --clip="openai/clip-vit-large-patch14" \ - --blip="Salesforce/blip-image-captioning-large" \ - --share -``` diff --git a/ppdiffusers/examples/clip_interrogator/clip_interrogator/__init__.py b/ppdiffusers/examples/clip_interrogator/clip_interrogator/__init__.py deleted file mode 100644 index 4ee76f27fd81..000000000000 --- a/ppdiffusers/examples/clip_interrogator/clip_interrogator/__init__.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
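The training loop in train_vae.py above alternates two Adam optimizers per batch, one for the autoencoder and one for the discriminator, and relies on toggle_optimizer / untoggle_optimizer (borrowed from PyTorch Lightning) so that gradients from one sub-objective do not accumulate on the other optimizer's parameters. Below is a minimal sketch of that toggling idea; the helper names are illustrative and not the actual AutoencoderKLWithLoss API.

```python
# Hedged sketch of the toggle/untoggle pattern used around each optimizer step
# above: while one optimizer steps, parameters it does not own are frozen.
def toggle(active_params, all_params):
    """Freeze every parameter the active optimizer does not own; return them."""
    active = {id(p) for p in active_params}
    frozen = []
    for p in all_params:
        if id(p) not in active and not p.stop_gradient:
            p.stop_gradient = True
            frozen.append(p)
    return frozen


def untoggle(frozen):
    """Restore the parameters frozen by toggle()."""
    for p in frozen:
        p.stop_gradient = False
```

In the loop above, the equivalent toggle_optimizer / untoggle_optimizer calls bracket the forward pass, backward pass, and step for each optimizer_idx.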
- -from .clip_interrogator import Config, Interrogator - -__version__ = "0.3.5" -__author__ = "pharmapsychotic" - - -CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ - # vit model - "openai/clip-vit-base-patch32", # ViT-B/32 - "openai/clip-vit-base-patch16", # ViT-B/16 - "openai/clip-vit-large-patch14", # ViT-L/14 - "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", - "laion/CLIP-ViT-B-32-laion2B-s34B-b79K", - # resnet model - "openai/clip-rn50", # RN50 - "openai/clip-rn101", # RN101 - "openai/clip-rn50x4", # RN50x4 -] - - -BLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "Salesforce/blip-image-captioning-base", - "Salesforce/blip-image-captioning-large", -] diff --git a/ppdiffusers/examples/clip_interrogator/clip_interrogator/blip_decoder.py b/ppdiffusers/examples/clip_interrogator/clip_interrogator/blip_decoder.py deleted file mode 100644 index a8eb1d461683..000000000000 --- a/ppdiffusers/examples/clip_interrogator/clip_interrogator/blip_decoder.py +++ /dev/null @@ -1,126 +0,0 @@ -""" - * Copyright (c) 2022, salesforce.com, inc. - * All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause - * By Junnan Li -""" -import paddle -import paddle.nn as nn -from fastcore.all import patch_to - -from paddlenlp.transformers import BlipForConditionalGeneration, BlipProcessor -from paddlenlp.transformers.generation_utils import BeamHypotheses - - -@patch_to(BeamHypotheses) -def add(self: BeamHypotheses, hyp: paddle.Tensor, sum_logprobs: float, origin_len: int = 0) -> None: - """ - Add a new hypothesis to the list. - """ - score = sum_logprobs / (hyp.shape[-1] ** self.length_penalty) - if len(self) < self.num_beams or score > self.worst_score: - self.beams.append((score, hyp)) - if len(self) > self.num_beams: - sorted_next_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) - del self.beams[sorted_next_scores[0][1]] - self.worst_score = sorted_next_scores[1][0] - else: - self.worst_score = min(score, self.worst_score) - - -@patch_to(BeamHypotheses) -def is_done(self: BeamHypotheses, best_sum_logprobs: float, cur_len: int, origin_len: int = 0) -> bool: - """ - If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst - one in the heap, then we are done with this sentence. 
- """ - - if len(self) < self.num_beams: - return False - elif self.early_stopping: - return True - else: - cur_score = best_sum_logprobs / cur_len**self.length_penalty - ret = self.worst_score >= cur_score - return ret - - -class BLIP_Decoder(nn.Layer): - def __init__( - self, - pretrained_model_name_or_path, - prompt="a picture of ", - ): - super().__init__() - self.text_decoder = BlipForConditionalGeneration.from_pretrained(pretrained_model_name_or_path) - self.text_decoder.eval() - self.processor = BlipProcessor.from_pretrained(pretrained_model_name_or_path) - self.processor.tokenizer.add_special_tokens({"bos_token": "[DEC]"}) - self.processor.tokenizer.add_special_tokens({"additional_special_tokens": ["[ENC]"]}) - self.processor.tokenizer.enc_token_id = self.processor.tokenizer.additional_special_tokens_ids[0] - self.prompt = prompt - self.prompt_length = len(self.processor.tokenizer(self.prompt).input_ids) - 1 - - def generate( - self, - image, - prompt=None, - sample=False, - num_beams=3, - max_length=30, - min_length=10, - top_p=0.9, - repetition_penalty=1.0, - ): - if prompt is None: - prompt = self.prompt - prompt_length = self.prompt_length - else: - prompt_length = len(self.processor.tokenizer(prompt).input_ids) - 1 - if not paddle.is_tensor(image): - model_kwargs = self.processor(images=image, return_tensors="pd") - else: - model_kwargs = {"pixel_values": image} - prompt = [prompt] * model_kwargs["pixel_values"].shape[0] - input_ids = self.processor.tokenizer(prompt, return_tensors="pd").input_ids - - if sample: - # nucleus sampling - outputs = self.text_decoder.generate( - input_ids=input_ids, - max_length=max_length - prompt_length, - min_length=min_length, - decode_strategy="sampling", - top_p=top_p, - num_return_sequences=1, - repetition_penalty=repetition_penalty, - **model_kwargs, - )[0] - else: - if num_beams == 1: - # greedy search - outputs = self.text_decoder.generate( - input_ids=input_ids, - max_length=max_length - prompt_length, - min_length=min_length, - decode_strategy="greedy_search", - **model_kwargs, - )[0] - else: - # beam search - outputs = self.text_decoder.generate( - input_ids=input_ids, - max_length=max_length - prompt_length, - min_length=min_length, - num_beams=num_beams, - decode_strategy="beam_search", - repetition_penalty=repetition_penalty, - length_penalty=1.0, # note this is not - **model_kwargs, - )[0] - - captions = [] - for output in outputs: - captions.append(self.processor.decode(output, skip_special_tokens=True)) - return captions diff --git a/ppdiffusers/examples/clip_interrogator/clip_interrogator/clip_interrogator.py b/ppdiffusers/examples/clip_interrogator/clip_interrogator/clip_interrogator.py deleted file mode 100644 index 5422976b73a7..000000000000 --- a/ppdiffusers/examples/clip_interrogator/clip_interrogator/clip_interrogator.py +++ /dev/null @@ -1,355 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import hashlib -import math -import os -import pickle -from dataclasses import dataclass -from functools import partial -from typing import List - -import numpy as np -import paddle -from paddle.vision import transforms -from PIL import Image -from tqdm import tqdm - -from paddlenlp.transformers import CLIPModel, CLIPProcessor - -from .blip_decoder import BLIP_Decoder - - -@dataclass -class Config: - # blip settings - blip_pretrained_model_name_or_path: str = "Salesforce/blip-image-captioning-large" - blip_image_eval_size: int = 384 - blip_max_length: int = 32 - blip_num_beams: int = 8 - blip_min_length: int = 5 - blip_top_p: float = 0.9 - blip_repetition_penalty: float = 1.0 - blip_sample: bool = False - - # clip settings - clip_pretrained_model_name_or_path: str = "openai/clip-vit-large-patch14" - - # interrogator settings - cache_path: str = "cache" - chunk_size: int = 2048 - data_path: str = os.path.join(os.path.dirname(__file__), "data") - flavor_intermediate_count: int = 2048 - quiet: bool = False # when quiet progress bars are not shown - - -class Interrogator: - def __init__(self, config: Config): - self.config = config - # blip model - self.load_blip_model() - self.load_clip_model() - - def load_blip_model(self): - config = self.config - self.blip_model = BLIP_Decoder(pretrained_model_name_or_path=config.blip_pretrained_model_name_or_path) - self.blip_model.eval() - - def load_clip_model(self): - config = self.config - - # clip model - self.clip_model: CLIPModel = CLIPModel.from_pretrained(config.clip_pretrained_model_name_or_path) - self.clip_model.eval() - self.clip_preprocess = CLIPProcessor.from_pretrained(config.clip_pretrained_model_name_or_path) - - sites = [ - "Artstation", - "behance", - "cg society", - "cgsociety", - "deviantart", - "dribble", - "flickr", - "instagram", - "pexels", - "pinterest", - "pixabay", - "pixiv", - "polycount", - "reddit", - "shutterstock", - "tumblr", - "unsplash", - "zbrush central", - ] - trending_list = [site for site in sites] - trending_list.extend(["trending on " + site for site in sites]) - trending_list.extend(["featured on " + site for site in sites]) - trending_list.extend([site + " contest winner" for site in sites]) - - raw_artists = _load_list(config.data_path, "artists.txt") - artists = [f"by {a}" for a in raw_artists] - artists.extend([f"inspired by {a}" for a in raw_artists]) - - # (TODO, junnyu) we must set pad_token_id to zero - self.clip_preprocess.tokenizer.pad_token_id = 0 - self.tokenize = partial( - self.clip_preprocess.tokenizer.__call__, - return_tensors="pd", - padding="max_length", - truncation=True, - max_length=self.clip_preprocess.tokenizer.model_max_length, - ) - self.artists = LabelTable(artists, "artists", self.clip_model, self.tokenize, config) - self.flavors = LabelTable( - _load_list(config.data_path, "flavors.txt"), "flavors", self.clip_model, self.tokenize, config - ) - self.mediums = LabelTable( - _load_list(config.data_path, "mediums.txt"), "mediums", self.clip_model, self.tokenize, config - ) - self.movements = LabelTable( - _load_list(config.data_path, "movements.txt"), "movements", self.clip_model, self.tokenize, config - ) - self.trendings = LabelTable(trending_list, "trendings", self.clip_model, self.tokenize, config) - self.pad_token_id = self.clip_preprocess.tokenizer.pad_token_id - - def generate_caption(self, pil_image: Image) -> str: - size = self.config.blip_image_eval_size - gpu_image = transforms.Compose( - [ - transforms.Resize((size, size), interpolation="bicubic"), - 
transforms.ToTensor(), - transforms.Normalize( - self.clip_preprocess.image_processor.image_mean, self.clip_preprocess.image_processor.image_std - ), - ] - )(pil_image).unsqueeze(0) - - with paddle.no_grad(): - caption = self.blip_model.generate( - gpu_image, - sample=self.config.blip_sample, - num_beams=self.config.blip_num_beams, - max_length=self.config.blip_max_length, - min_length=self.config.blip_min_length, - top_p=self.config.blip_top_p, - repetition_penalty=self.config.blip_repetition_penalty, - ) - return caption[0] - - def image_to_features(self, image: Image) -> paddle.Tensor: - images = self.clip_preprocess(images=image, return_tensors="pd") - with paddle.no_grad(): - image_features = self.clip_model.get_image_features(images["pixel_values"]) - image_features /= image_features.norm(axis=-1, keepdim=True) - return image_features - - def interrogate_classic(self, image: Image, max_flavors: int = 3) -> str: - caption = self.generate_caption(image) - image_features = self.image_to_features(image) - - medium = self.mediums.rank(image_features, 1)[0] - artist = self.artists.rank(image_features, 1)[0] - trending = self.trendings.rank(image_features, 1)[0] - movement = self.movements.rank(image_features, 1)[0] - flaves = ", ".join(self.flavors.rank(image_features, max_flavors)) - - if caption.startswith(medium): - prompt = f"{caption} {artist}, {trending}, {movement}, {flaves}" - else: - prompt = f"{caption}, {medium} {artist}, {trending}, {movement}, {flaves}" - - return _truncate_to_fit(prompt, self.tokenize, self.pad_token_id) - - def interrogate_fast(self, image: Image, max_flavors: int = 32) -> str: - caption = self.generate_caption(image) - image_features = self.image_to_features(image) - merged = _merge_tables([self.artists, self.flavors, self.mediums, self.movements, self.trendings], self.config) - tops = merged.rank(image_features, max_flavors) - return _truncate_to_fit(caption + ", " + ", ".join(tops), self.tokenize, self.pad_token_id) - - def interrogate(self, image: Image, max_flavors: int = 32) -> str: - caption = self.generate_caption(image) - image_features = self.image_to_features(image) - - flaves = self.flavors.rank(image_features, self.config.flavor_intermediate_count) - best_medium = self.mediums.rank(image_features, 1)[0] - best_artist = self.artists.rank(image_features, 1)[0] - best_trending = self.trendings.rank(image_features, 1)[0] - best_movement = self.movements.rank(image_features, 1)[0] - - best_prompt = caption - best_sim = self.similarity(image_features, best_prompt) - - def check(addition: str) -> bool: - nonlocal best_prompt, best_sim - prompt = best_prompt + ", " + addition - sim = self.similarity(image_features, prompt) - if sim > best_sim: - best_sim = sim - best_prompt = prompt - return True - return False - - def check_multi_batch(opts: List[str]): - nonlocal best_prompt, best_sim - prompts = [] - for i in range(2 ** len(opts)): - prompt = best_prompt - for bit in range(len(opts)): - if i & (1 << bit): - prompt += ", " + opts[bit] - prompts.append(prompt) - - t = LabelTable(prompts, None, self.clip_model, self.tokenize, self.config) - best_prompt = t.rank(image_features, 1)[0] - best_sim = self.similarity(image_features, best_prompt) - - check_multi_batch([best_medium, best_artist, best_trending, best_movement]) - - extended_flavors = set(flaves) - for i in tqdm(range(max_flavors), desc="Flavor chain", disable=self.config.quiet): - best = self.rank_top(image_features, [f"{best_prompt}, {f}" for f in extended_flavors]) - flave = 
best[len(best_prompt) + 2 :] - if not check(flave): - break - if _prompt_at_max_len(best_prompt, self.tokenize, self.pad_token_id): - break - extended_flavors.remove(flave) - - return best_prompt - - def rank_top(self, image_features: paddle.Tensor, text_array: List[str]) -> str: - text_tokens = self.tokenize(text_array) - with paddle.no_grad(): - text_features = self.clip_model.get_text_features(text_tokens["input_ids"]) - text_features /= text_features.norm(axis=-1, keepdim=True) - similarity = text_features @ image_features.T - return text_array[similarity.argmax().item()] - - def similarity(self, image_features: paddle.Tensor, text: str) -> float: - text_tokens = self.tokenize([text]) - with paddle.no_grad(): - text_features = self.clip_model.get_text_features(text_tokens["input_ids"]) - text_features /= text_features.norm(axis=-1, keepdim=True) - similarity = text_features @ image_features.T - return similarity[0][0].item() - - -class LabelTable: - def __init__(self, labels: List[str], desc: str, clip_model, tokenize, config: Config): - self.chunk_size = config.chunk_size - self.config = config - self.embeds = [] - self.labels = labels - self.tokenize = tokenize - - hash = hashlib.sha256(",".join(labels).encode()).hexdigest() - - cache_filepath = None - if config.cache_path is not None and desc is not None: - os.makedirs(config.cache_path, exist_ok=True) - sanitized_name = config.clip_pretrained_model_name_or_path.replace("/", "_").replace("@", "_") - cache_filepath = os.path.join(config.cache_path, f"{sanitized_name}_{desc}.pkl") - if desc is not None and os.path.exists(cache_filepath): - with open(cache_filepath, "rb") as f: - try: - data = pickle.load(f) - if data.get("hash") == hash: - self.labels = data["labels"] - self.embeds = data["embeds"] - except Exception as e: - print(f"Error loading cached table {desc}: {e}") - - if len(self.labels) != len(self.embeds): - self.embeds = [] - chunks = np.array_split(self.labels, max(1, len(self.labels) / config.chunk_size)) - for chunk in tqdm(chunks, desc=f"Preprocessing {desc}" if desc else None, disable=self.config.quiet): - text_tokens = self.tokenize(chunk.tolist()) - with paddle.no_grad(): - text_features = clip_model.get_text_features(text_tokens["input_ids"]) - text_features /= text_features.norm(axis=-1, keepdim=True) - text_features = text_features.cpu().numpy() - for i in range(text_features.shape[0]): - self.embeds.append(text_features[i]) - - if cache_filepath is not None: - with open(cache_filepath, "wb") as f: - pickle.dump( - { - "labels": self.labels, - "embeds": self.embeds, - "hash": hash, - "model": config.clip_pretrained_model_name_or_path, - }, - f, - ) - - def _rank(self, image_features: paddle.Tensor, text_embeds: paddle.Tensor, top_count: int = 1) -> str: - top_count = min(top_count, len(text_embeds)) - text_embeds = paddle.to_tensor(text_embeds) - similarity = image_features @ text_embeds.T - _, top_labels = similarity.cast("float32").topk(top_count, axis=-1) - top_labels = top_labels.tolist() - return [top_labels[0][i] for i in range(top_count)] - - def rank(self, image_features: paddle.Tensor, top_count: int = 1) -> List[str]: - if len(self.labels) <= self.chunk_size: - tops = self._rank(image_features, self.embeds, top_count=top_count) - return [self.labels[i] for i in tops] - - num_chunks = int(math.ceil(len(self.labels) / self.chunk_size)) - keep_per_chunk = int(self.chunk_size / num_chunks) - - top_labels, top_embeds = [], [] - for chunk_idx in tqdm(range(num_chunks), disable=self.config.quiet): - start = 
chunk_idx * self.chunk_size - stop = min(start + self.chunk_size, len(self.embeds)) - tops = self._rank(image_features, self.embeds[start:stop], top_count=keep_per_chunk) - top_labels.extend([self.labels[start + i] for i in tops]) - top_embeds.extend([self.embeds[start + i] for i in tops]) - - tops = self._rank(image_features, top_embeds, top_count=top_count) - return [top_labels[i] for i in tops] - - -def _load_list(data_path: str, filename: str) -> List[str]: - with open(os.path.join(data_path, filename), "r", encoding="utf-8", errors="replace") as f: - items = [line.strip() for line in f.readlines()] - return items - - -def _merge_tables(tables: List[LabelTable], config: Config) -> LabelTable: - m = LabelTable([], None, None, None, config) - for table in tables: - m.labels.extend(table.labels) - m.embeds.extend(table.embeds) - return m - - -def _prompt_at_max_len(text: str, tokenize, pad_token_id: int = 0) -> bool: - tokens = tokenize([text])["input_ids"] - return tokens[0][-1] != pad_token_id - - -def _truncate_to_fit(text: str, tokenize, pad_token_id) -> str: - parts = text.split(", ") - new_text = parts[0] - for part in parts[1:]: - if _prompt_at_max_len(new_text + part, tokenize, pad_token_id): - break - new_text += ", " + part - return new_text diff --git a/ppdiffusers/examples/clip_interrogator/dumpy.py b/ppdiffusers/examples/clip_interrogator/dumpy.py deleted file mode 100644 index 552e84eae594..000000000000 --- a/ppdiffusers/examples/clip_interrogator/dumpy.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
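LabelTable above caches L2-normalized CLIP text embeddings for each label list and ranks labels against an image embedding by cosine similarity; when the label list exceeds chunk_size it first keeps the best candidates per chunk and then re-ranks the survivors. A minimal sketch of the core similarity-and-topk step, with random placeholder tensors standing in for real CLIP features:

```python
# Hedged sketch of LabelTable._rank above: with L2-normalized features on both
# sides, a matrix product yields cosine similarities and topk picks the labels.
import paddle

image_features = paddle.randn([1, 768])      # [1, D] placeholder image embedding
text_embeds = paddle.randn([1000, 768])      # [num_labels, D] placeholder label embeddings
image_features /= image_features.norm(axis=-1, keepdim=True)
text_embeds /= text_embeds.norm(axis=-1, keepdim=True)

similarity = image_features @ text_embeds.T  # [1, num_labels] cosine similarities
_, top_labels = similarity.topk(5, axis=-1)  # indices of the 5 best labels
```

Because both sides are normalized, the matrix product is exactly the cosine similarity that _rank sorts with topk above.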
- -import gradio as gr -from clip_interrogator import ( - BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - Config, - Interrogator, -) - -blip_pretrained_model_name_or_path = "Salesforce/blip-image-captioning-base" -clip_pretrained_model_name_or_path = "openai/clip-vit-large-patch14" - -# validate clip model name -if clip_pretrained_model_name_or_path not in CLIP_PRETRAINED_MODEL_ARCHIVE_LIST: - clip_models = ", ".join(CLIP_PRETRAINED_MODEL_ARCHIVE_LIST) - print(f"Could not find CLIP model {clip_pretrained_model_name_or_path}!") - print(f" available clip models: {clip_models}") - exit(1) - -# validate clip model name -if blip_pretrained_model_name_or_path not in BLIP_PRETRAINED_MODEL_ARCHIVE_LIST: - blip_models = ", ".join(BLIP_PRETRAINED_MODEL_ARCHIVE_LIST) - print(f"Could not find BLIP model {blip_pretrained_model_name_or_path}!") - print(f" available blip models: {blip_models}") - exit(1) - -config = Config( - blip_num_beams=64, - blip_pretrained_model_name_or_path=blip_pretrained_model_name_or_path, - clip_pretrained_model_name_or_path=clip_pretrained_model_name_or_path, -) -ci = Interrogator(config) - - -def inference(image, mode, best_max_flavors=32): - ci.config.chunk_size = ( - 2048 if ci.config.clip_pretrained_model_name_or_path == "openai/clip-vit-large-patch14" else 1024 - ) - ci.config.flavor_intermediate_count = ( - 2048 if ci.config.clip_pretrained_model_name_or_path == "openai/clip-vit-large-patch14" else 1024 - ) - image = image.convert("RGB") - if mode == "best": - return ci.interrogate(image, max_flavors=int(best_max_flavors)) - elif mode == "classic": - return ci.interrogate_classic(image) - else: - return ci.interrogate_fast(image) - - -inputs = [ - gr.inputs.Image(type="pil"), - gr.Radio(["best", "fast", "classic"], label="", value="best"), - gr.Number(value=16, label="best mode max flavors"), -] -outputs = [ - gr.outputs.Textbox(label="Output"), -] - -io = gr.Interface( - inference, - inputs, - outputs, - allow_flagging=False, -) -io.launch(debug=False, server_name="0.0.0.0", server_port=8586) diff --git a/ppdiffusers/examples/clip_interrogator/predict.py b/ppdiffusers/examples/clip_interrogator/predict.py deleted file mode 100644 index 62f508c1c193..000000000000 --- a/ppdiffusers/examples/clip_interrogator/predict.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from clip_interrogator import ( - BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - Config, - Interrogator, -) -from cog import BasePredictor, Input, Path -from PIL import Image - - -class Predictor(BasePredictor): - def setup(self): - self.ci = Interrogator( - Config( - blip_pretrained_model_name_or_path="Salesforce/blip-image-captioning-large", - clip_pretrained_model_name_or_path="openai/clip-vit-large-patch14", - device="gpu", - ) - ) - - def predict( - self, - image: Path = Input(description="Input image"), - clip_pretrained_model_name_or_path: str = Input( - default="openai/clip-vit-large-patch14", - choices=CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - description="Choose ViT-L for Stable Diffusion 1, and ViT-H for Stable Diffusion 2", - ), - blip_pretrained_model_name_or_path: str = Input( - default="Salesforce/blip-image-captioning-large", - choices=BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - description="Choose Salesforce/blip-image-captioning-large", - ), - mode: str = Input( - default="best", - choices=["best", "classic", "fast"], - description="Prompt mode (best takes 10-20 seconds, fast takes 1-2 seconds).", - ), - ) -> str: - """Run a single prediction on the model""" - image = Image.open(str(image)).convert("RGB") - self.switch_model(clip_pretrained_model_name_or_path, blip_pretrained_model_name_or_path) - if mode == "best": - return self.ci.interrogate(image) - elif mode == "classic": - return self.ci.interrogate_classic(image) - else: - return self.ci.interrogate_fast(image) - - def switch_model(self, clip_pretrained_model_name_or_path: str, blip_pretrained_model_name_or_path: str): - if clip_pretrained_model_name_or_path != self.ci.config.clip_pretrained_model_name_or_path: - self.ci.config.clip_pretrained_model_name_or_path = clip_pretrained_model_name_or_path - self.ci.load_clip_model() - if blip_pretrained_model_name_or_path != self.ci.config.blip_pretrained_model_name_or_path: - self.ci.config.blip_pretrained_model_name_or_path = blip_pretrained_model_name_or_path - self.ci.load_blip_model() diff --git a/ppdiffusers/examples/clip_interrogator/requirements.txt b/ppdiffusers/examples/clip_interrogator/requirements.txt deleted file mode 100644 index 9e3b9fa1757e..000000000000 --- a/ppdiffusers/examples/clip_interrogator/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -paddlenlp>=2.6.0rc0 -Pillow -requests -tqdm -fastcore \ No newline at end of file diff --git a/ppdiffusers/examples/clip_interrogator/run_cli.py b/ppdiffusers/examples/clip_interrogator/run_cli.py deleted file mode 100755 index 72ef646cc328..000000000000 --- a/ppdiffusers/examples/clip_interrogator/run_cli.py +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import csv -import os - -import paddle -import requests -from clip_interrogator import ( - BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - Config, - Interrogator, -) -from PIL import Image - - -def inference(ci, image, mode): - image = image.convert("RGB") - if mode == "best": - return ci.interrogate(image) - elif mode == "classic": - return ci.interrogate_classic(image) - else: - return ci.interrogate_fast(image) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "-c", - "--clip", - default="openai/clip-vit-large-patch14", - choices=CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - help="name of CLIP model to use", - ) - parser.add_argument( - "-b", - "--blip", - default="Salesforce/blip-image-captioning-large", - choices=BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - help="name of BLIP model to use", - ) - parser.add_argument("-d", "--device", default="auto", help="device to use (auto, gpu or cpu)") - parser.add_argument("-f", "--folder", help="path to folder of images") - parser.add_argument("-i", "--image", help="image file or url") - parser.add_argument( - "-m", "--mode", default="best", choices=["best", "classic", "fast"], help="best, classic, or fast" - ) - - args = parser.parse_args() - if not args.folder and not args.image: - parser.print_help() - exit(1) - - if args.folder is not None and args.image is not None: - print("Specify a folder or batch processing or a single image, not both") - exit(1) - - # validate clip model name - if args.clip not in CLIP_PRETRAINED_MODEL_ARCHIVE_LIST: - models = ", ".join(CLIP_PRETRAINED_MODEL_ARCHIVE_LIST) - print(f"Could not find CLIP model {args.clip}!") - print(f" available models: {models}") - exit(1) - - # validate clip model name - if args.blip not in BLIP_PRETRAINED_MODEL_ARCHIVE_LIST: - models = ", ".join(BLIP_PRETRAINED_MODEL_ARCHIVE_LIST) - print(f"Could not find BLIP model {args.blip}!") - print(f" available models: {models}") - exit(1) - - # select device - if args.device == "auto": - device = "gpu" if paddle.device.is_compiled_with_cuda() else "cpu" - else: - device = args.device - paddle.set_device(device) - # generate a nice prompt - config = Config(clip_pretrained_model_name_or_path=args.clip, blip_pretrained_model_name_or_path=args.blip) - ci = Interrogator(config) - - # process single image - if args.image is not None: - image_path = args.image - if str(image_path).startswith("http://") or str(image_path).startswith("https://"): - image = Image.open(requests.get(image_path, stream=True).raw).convert("RGB") - else: - image = Image.open(image_path).convert("RGB") - if not image: - print(f"Error opening image {image_path}") - exit(1) - print(inference(ci, image, args.mode)) - - # process folder of images - elif args.folder is not None: - if not os.path.exists(args.folder): - print(f"The folder {args.folder} does not exist!") - exit(1) - - files = [f for f in os.listdir(args.folder) if f.endswith(".jpg") or f.endswith(".png")] - prompts = [] - for file in files: - image = Image.open(os.path.join(args.folder, file)).convert("RGB") - prompt = inference(ci, image, args.mode) - prompts.append(prompt) - print(prompt) - - if len(prompts): - csv_path = os.path.join(args.folder, "desc.csv") - with open(csv_path, "w", encoding="utf-8", newline="") as f: - w = csv.writer(f, quoting=csv.QUOTE_MINIMAL) - w.writerow(["image", "prompt"]) - for file, prompt in zip(files, prompts): - w.writerow([file, prompt]) - - print(f"\n\n\n\nGenerated {len(prompts)} and saved to {csv_path}, enjoy!") - - 
-if __name__ == "__main__": - main() diff --git a/ppdiffusers/examples/clip_interrogator/run_gradio.py b/ppdiffusers/examples/clip_interrogator/run_gradio.py deleted file mode 100755 index 1651e884e588..000000000000 --- a/ppdiffusers/examples/clip_interrogator/run_gradio.py +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse - -import gradio as gr -import paddle -from clip_interrogator import ( - BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - Config, - Interrogator, -) - -parser = argparse.ArgumentParser() -parser.add_argument( - "-c", - "--clip", - default="openai/clip-vit-large-patch14", - choices=CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - help="name of CLIP model to use", -) -parser.add_argument( - "-b", - "--blip", - default="Salesforce/blip-image-captioning-large", - choices=BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - help="name of BLIP model to use", -) -parser.add_argument("-d", "--device", default="auto", help="device to use (auto, gpu or cpu)") -parser.add_argument("-s", "--share", action="store_true", help="Create a public link") -parser.add_argument("--server_name", default="0.0.0.0", type=str, help="server_name") -parser.add_argument("--server_port", default=8586, type=int, help="server_port") - -args = parser.parse_args() - -# validate clip model name -if args.clip not in CLIP_PRETRAINED_MODEL_ARCHIVE_LIST: - clip_models = ", ".join(CLIP_PRETRAINED_MODEL_ARCHIVE_LIST) - print(f"Could not find CLIP model {args.clip}!") - print(f" available clip models: {clip_models}") - exit(1) - -# validate clip model name -if args.blip not in BLIP_PRETRAINED_MODEL_ARCHIVE_LIST: - blip_models = ", ".join(BLIP_PRETRAINED_MODEL_ARCHIVE_LIST) - print(f"Could not find BLIP model {args.blip}!") - print(f" available blip models: {blip_models}") - exit(1) - -# select device -if args.device == "auto": - device = "gpu" if paddle.device.is_compiled_with_cuda() else "cpu" -else: - device = args.device -paddle.set_device(device) -config = Config( - cache_path="cache", clip_pretrained_model_name_or_path=args.clip, blip_pretrained_model_name_or_path=args.blip -) -ci = Interrogator(config) - - -def inference( - image, - mode, - clip_pretrained_model_name_or_path, - blip_pretrained_model_name_or_path, - blip_min_length, - blip_max_length, - blip_sample, - blip_top_p, - blip_repetition_penalty, - blip_num_beams, -): - if clip_pretrained_model_name_or_path != ci.config.clip_pretrained_model_name_or_path: - ci.config.clip_pretrained_model_name_or_path = clip_pretrained_model_name_or_path - ci.load_clip_model() - - if blip_pretrained_model_name_or_path != ci.config.blip_pretrained_model_name_or_path: - ci.config.blip_pretrained_model_name_or_path = blip_pretrained_model_name_or_path - ci.load_blip_model() - - ci.config.blip_min_length = int(blip_min_length) - ci.config.blip_max_length = 
int(blip_max_length) - ci.config.blip_sample = eval(blip_sample) - ci.config.blip_top_p = float(blip_top_p) - ci.config.blip_repetition_penalty = float(blip_repetition_penalty) - ci.config.blip_num_beams = int(blip_num_beams) - - image = image.convert("RGB") - if mode == "best": - return ci.interrogate(image) - elif mode == "classic": - return ci.interrogate_classic(image) - else: - return ci.interrogate_fast(image) - - -inputs = [ - gr.inputs.Image(type="pil"), - gr.Radio(["best", "classic", "fast"], label="Mode", value="fast"), - gr.Dropdown(CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, value=args.clip, label="CLIP Model"), - gr.Dropdown(BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, value=args.blip, label="BLIP Model"), - gr.Number(value=8, label="Caption min Length"), - gr.Number(value=32, label="Caption Max Length"), - gr.Radio(["True", "False"], value="False", label="Sample or not?"), - gr.Number(value=0.9, label="TopP value, when Sample is true"), - gr.Number(value=1.1, label="Repetition penalty value, when Sample is false"), - gr.Number(value=64, label="Caption Num Beams, when Sample is false"), -] -outputs = [ - gr.outputs.Textbox(label="Image Caption Output"), -] - -io = gr.Interface( - inference, - inputs, - outputs, - title="🕵️‍♂️ Paddle CLIP Interrogator 🕵️‍♂️", - allow_flagging=False, -) -io.launch(share=args.share, server_name=args.server_name, server_port=args.server_port) diff --git a/ppdiffusers/examples/community/README.md b/ppdiffusers/examples/community/README.md deleted file mode 100644 index b8e001a434f8..000000000000 --- a/ppdiffusers/examples/community/README.md +++ /dev/null @@ -1,628 +0,0 @@ -# Community Examples - -社区示例包含由社区添加的推理和训练示例。可以从下表中了解所有社区实例的概况。点击**Code Example**,跳转到对应实例的可运行代码,可以复制并运行。如果一个示例不能像预期的那样工作,请创建一个issue提问。 - -|Example|Description|Code Example|Author| -|-|-|-|-| -|CLIP Guided Stable Diffusion|使用CLIP引导Stable Diffusion实现文生图|[CLIP Guided Stable Diffusion](#clip-guided-stable-diffusion)|| -|Stable Diffusion Interpolation|在不同的prompts或seed的Stable Diffusion潜空间进行插值|[Stable Diffusion Interpolation](#stable-diffusion-interpolation)|| -|Stable Diffusion Mega|一个集成Stable Diffusion 文生图、图生图、图像修复的Pipeline|[Stable Diffusion Mega](#stable-diffusion-mega)|| -|Long Prompt Weighting Stable Diffusion| 一个没有token数目限制的Stable Diffusion Pipeline,支持在prompt中解析权重|[Long Prompt Weighting Stable Diffusion](#long-prompt-weighting-stable-diffusion)|| -|AUTOMATIC1111 WebUI Stable Diffusion| 与AUTOMATIC1111的WebUI基本一致的Pipeline |[AUTOMATIC1111 WebUI Stable Diffusion](#automatic1111-webui-stable-diffusion)|| -|Stable Diffusion with High Resolution Fixing| 使用高分辨率修复功能进行文图生成|[Stable Diffusion with High Resolution Fixing](#stable-diffusion-with-high-resolution-fixing)|| -|ControlNet Reference Only| 基于参考图片生成与图片相似的图片|[ControlNet Reference Only](#controlnet-reference-only)|| -|Stable Diffusion Mixture Tiling| 基于Mixture机制的多文本大图生成Stable Diffusion Pipeline|[Stable Diffusion Mixture Tiling](#stable-diffusion-mixture-tiling)|| -|CLIP Guided Images Mixing Stable Diffusion Pipeline| 一个用于图片融合的Stable Diffusion Pipeline|[CLIP Guided Images Mixing Using Stable Diffusion](#clip-guided-images-mixing-with-stable-diffusion)|| - -## Example usages - -### CLIP Guided Stable Diffusion - -使用 CLIP 模型引导 Stable Diffusion 去噪,可以生成更真实的图像。 - -以下代码运行需要16GB的显存。 - -```python -import os - -import paddle -from clip_guided_stable_diffusion import CLIPGuidedStableDiffusion - -from paddlenlp.transformers import CLIPFeatureExtractor, CLIPModel - -feature_extractor = CLIPFeatureExtractor.from_pretrained( - "laion/CLIP-ViT-B-32-laion2B-s34B-b79K") 
-clip_model = CLIPModel.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K", - dtype=paddle.float32) - -guided_pipeline = CLIPGuidedStableDiffusion.from_pretrained( - "runwayml/stable-diffusion-v1-5", - clip_model=clip_model, - feature_extractor=feature_extractor, - paddle_dtype=paddle.float16, -) -guided_pipeline.enable_attention_slicing() - -prompt = "fantasy book cover, full moon, fantasy forest landscape, golden vector elements, fantasy magic, dark light night, intricate, elegant, sharp focus, illustration, highly detailed, digital painting, concept art, matte, art by WLOP and Artgerm and Albert Bierstadt, masterpiece" - -generator = paddle.Generator().manual_seed(2022) -with paddle.amp.auto_cast(True, level="O2"): - images = [] - for i in range(4): - image = guided_pipeline( - prompt, - num_inference_steps=50, - guidance_scale=7.5, - clip_guidance_scale=100, - num_cutouts=4, - use_cutouts=False, - generator=generator, - unfreeze_unet=False, - unfreeze_vae=False, - ).images[0] - images.append(image) - -# save images locally -if not os.path.exists("clip_guided_sd"): - os.mkdir("clip_guided_sd") -for i, img in enumerate(images): - img.save(f"./clip_guided_sd/image_{i}.png") -``` -生成的图片保存在`images`列表中,样例如下: - -| image_0 | image_1 | image_2 | image_3 | -|:-------------------:|:-------------------:|:-------------------:|:-------------------:| -|![][clip_guided_sd_0]|![][clip_guided_sd_1]|![][clip_guided_sd_2]|![][clip_guided_sd_3]| - -[clip_guided_sd_0]: https://user-images.githubusercontent.com/40912707/220514674-e5cb29a3-b07e-4e8f-a4c8-323b35637294.png -[clip_guided_sd_1]: https://user-images.githubusercontent.com/40912707/220514703-1eaf444e-1506-4c44-b686-5950fd79a3da.png -[clip_guided_sd_2]: https://user-images.githubusercontent.com/40912707/220514765-89e48c13-156f-4e61-b433-06f1283d2265.png -[clip_guided_sd_3]: https://user-images.githubusercontent.com/40912707/220514751-82d63fd4-e35e-482b-a8e1-c5c956119b2e.png - -### Wildcard Stable Diffusion - -例如我们有下面的prompt: - -```python -prompt = "__animal__ sitting on a __object__ wearing a __clothing__" -``` -然后,我们可以定义动物、物体和衣服的可能采样值。这些文件可以来自与类别同名的.txt文件。 -这些可能值也可以定义为字典,例如:`{"animal":["dog", "cat", mouse"]}` - -下面是一个完整的示例: -创建一个`animal.txt`,包含的内容为: - -``` -dog -cat -mouse -``` -创建一个`object.txt`,包含的内容为: -``` -chair -sofa -bench -``` -代码示例为: -```python -from wildcard_stable_diffusion import WildcardStableDiffusionPipeline - -pipe = WildcardStableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4" -) -prompt = "__animal__ sitting on a __object__ wearing a __clothing__" -image = pipe( - prompt, - wildcard_option_dict={ - "clothing":["hat", "shirt", "scarf", "beret"] - }, - wildcard_files=["object.txt", "animal.txt"], - num_prompt_samples=1 -).images[0] -image.save("wildcard_img.png") -``` - -### Composable Stable diffusion - -以下代码需要9GB的显存。 -```python -import os - -import paddle -from composable_stable_diffusion import ComposableStableDiffusionPipeline - -prompt = "mystical trees | A magical pond | dark" -scale = 7.5 -steps = 50 -weights = "7.5 | 7.5 | -7.5" -pipe = ComposableStableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", -) -pipe.safety_checker = None - -images = [] -generator = paddle.Generator().manual_seed(2) -for i in range(4): - image = pipe(prompt, guidance_scale=scale, num_inference_steps=steps, - weights=weights, generator=generator).images[0] - images.append(image) - -# save images locally -if not os.path.exists("composable_sd"): - os.mkdir("composable_sd") -for i, img in enumerate(images): - 
img.save(f"./composable_sd/image_{i}.png") -``` - -### One Step Unet - -one-step-unet可以按照下面的方式运行: - -```python -from one_step_unet import UnetSchedulerOneForwardPipeline - -pipe = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32") -pipe() -``` -这个pipeline不是作为feature使用的,它只是一个如何添加社区pipeline的示例 - -### Stable Diffusion Interpolation - -以下代码运行需要10GB的显存。 - -```python -from interpolate_stable_diffusion import StableDiffusionWalkPipeline -import paddle - -pipe = StableDiffusionWalkPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - paddle_dtype=paddle.float16, - safety_checker= - None, # Very important for videos...lots of false positives while interpolating - # custom_pipeline="interpolate_stable_diffusion", -) -pipe.enable_attention_slicing() - -prompts = [ - 'a photo of a landscape in summer', - 'a photo of a landscape in autumn', -] -seeds = [0] * len(prompts) - -with paddle.amp.auto_cast(True, level="O2"): - frame_filepaths = pipe.walk( - prompts=prompts, - seeds=seeds, - num_interpolation_steps=16, - output_dir='./dreams', - batch_size=4, - height=512, - width=512, - guidance_scale=8.5, - num_inference_steps=50, - ) -``` - -`walk(...)`方法将生成一系列图片,保存在`output_dir`指定的目录下,并返回这些图片的路径。你可以使用这些图片来制造stable diffusion视频。上述代码生成的效果如下: - -
- -
- -> 关于如何使用 stable diffusion 制作视频详细介绍以及更多完整的功能,请参考 [https://github.com/nateraw/stable-diffusion-videos](https://github.com/nateraw/stable-diffusion-videos)。 - - -### Stable Diffusion Mega - -`StableDiffusionMegaPipeline`可以让你在一个类里使用stable diffusion的主要用例。下述示例代码中展示了在一个pipeline中运行"text-to-image", "image-to-image", and "inpainting"。 - -```python -from stable_diffusion_mega import StableDiffusionMegaPipeline -import PIL -import requests -from io import BytesIO -import paddle - - -def download_image(url): - response = requests.get(url) - return PIL.Image.open(BytesIO(response.content)).convert("RGB") - - -pipe = StableDiffusionMegaPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", paddle_dtype=paddle.float16) -# pipe.to("gpu") -pipe.enable_attention_slicing() -generator = paddle.Generator().manual_seed(2022) - -# Text-to-Image -with paddle.amp.auto_cast(True, level="O2"): - images = pipe.text2img("An astronaut riding a horse", - generator=generator).images - -images[0].save("text2img.png") - -# Image-to-Image - -init_image = download_image( - "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" -) -prompt = "A fantasy landscape, trending on artstation" -with paddle.amp.auto_cast(True, level="O2"): - images = pipe.img2img(prompt=prompt, - image=init_image, - strength=0.75, - guidance_scale=7.5, - generator=generator).images -images[0].save("img2img.png") - -# Inpainting - -img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" -mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" -init_image = download_image(img_url).resize((512, 512)) -mask_image = download_image(mask_url).resize((512, 512)) - -prompt = "a cat sitting on a bench" -with paddle.amp.auto_cast(True, level="O2"): - images = pipe.inpaint(prompt=prompt, - image=init_image, - mask_image=mask_image, - strength=0.75, - generator=generator).images -images[0].save("inpainting.png") -``` -上述代码生成效果如下: - -|使用|源|效果| -|:-:|:-:|:-:| -|text-to-image|An astronaut riding a horse|| -|image-to-image|
A fantasy landscape, trending on artstation|![][img2img]|
-|inpainting|
a cat sitting on a bench|| - -[text2img]: https://user-images.githubusercontent.com/40912707/220876185-4c2c01f8-90f3-45c4-813a-7143541ec456.png -[img2img]: https://user-images.githubusercontent.com/40912707/220876054-5eca5e9a-340e-40a4-a28e-b97af1b006e9.png -[inpainting]: https://user-images.githubusercontent.com/40912707/220876220-ee044a56-6455-4566-9f42-580e29555497.png - - -### Long Prompt Weighting Stable Diffusion - -该自定义Pipeline特征如下: -* 输入提示没有77 token的长度限制 -* 包括文生图、图生图、图像修复三种管道 -* 给提示片段加上强调,例如 `a baby deer with (big eyes)` -* 给提示片段加上淡化,例如 `a [baby] deer with big eyes` -* 给提示片段加上精确的权重,例如 `a baby deer with (big eyes:1.3)` - -prompt加权公示: -* `a baby deer with` == `(a baby deer with:1.0)` -* `(big eyes)` == `(big eyes:1.1)` -* `((big eyes))` == `(big eyes:1.21)` -* `[big eyes]` == `(big eyes:0.91)` - -代码示例如下: - -```python -import paddle -from lpw_stable_diffusion import StableDiffusionLongPromptWeightingPipeline - -pipe = StableDiffusionLongPromptWeightingPipeline.from_pretrained( - "hakurei/waifu-diffusion", - paddle_dtype=paddle.float16, - ) - -prompt = "1girl, aqua eyes, baseball cap, blonde hair, closed mouth, earrings, green background, hat, hoop earrings, jewelry, looking at viewer, shirt, short hair, simple background, solo, upper body, yellow shirt" -neg_prompt = "lowres, bad_anatomy, error_body, error_hair, error_arm, (error_hands, bad_hands, error_fingers, bad_fingers, missing_fingers) error_legs, bad_legs, multiple_legs, missing_legs, error_lighting, error_shadow, error_reflection, text, error, extra_digit, fewer_digits, cropped, worst_quality, low_quality, normal_quality, jpeg_artifacts, signature, watermark, username, blurry" - -generator = paddle.Generator().manual_seed(0) - -with paddle.amp.auto_cast(True, level="O2"): - images = pipe.text2img(prompt, - negative_prompt=neg_prompt, - width=512, - height=512, - max_embeddings_multiples=3, - generator=generator).images - -images[0].save("lpw.png") -``` - -上述代码生成结果如下: - -
- - -### AUTOMATIC1111 WebUI Stable Diffusion -`WebUIStableDiffusionPipeline` 是与 [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) 基本对齐的一个pipeline。 - -该自定义Pipeline支持如下的功能: -* 输入的 token 没有长度限制,可以超过77; -* 支持clip_skip,即可以使用不同层text_encoder的输出; -* 支持直接加载webui中的textual_inversion权重; -* 支持ControlNet; - -```python -from pathlib import Path - -import cv2 -import numpy as np -import paddle -from PIL import Image -from webui_stable_diffusion import WebUIStableDiffusionPipeline - -from ppdiffusers import ControlNetModel, DiffusionPipeline -from ppdiffusers.utils import image_grid, load_image - -# 支持controlnet模型 -controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", paddle_dtype=paddle.float16) -pipe = WebUIStableDiffusionPipeline.from_pretrained( - "TASUKU2023/Chilloutmix", controlnet=controlnet, paddle_dtype=paddle.float16 -) -# 或者 -# pipe = DiffusionPipeline.from_pretrained("TASUKU2023/Chilloutmix", controlnet=controlnet, paddle_dtype=paddle.float16, custom_pipeline="webui_stable_diffusion") - -# 自动下载civitai的lora及ti文件(请注意自己的网络。) -# 介绍网页,程序将自动搜索介绍网页的下载链接 -pipe.download_civitai_lora_file("https://civitai.com/models/15365/hanfu") -pipe.download_civitai_lora_file("https://civitai.com/models/12597/moxin") -pipe.download_civitai_ti_file("https://civitai.com/models/1998/autumn-style") -pipe.download_civitai_ti_file("https://civitai.com/models/21131/daisy-ridley-embedding") -# 纯下载链接 -pipe.download_civitai_lora_file("https://civitai.com/api/download/models/21656") - -print("Supported Lora: " + "、 ".join([p.stem for p in Path(pipe.LORA_DIR).glob("*.safetensors")])) - -# 我们需要安装develop版的paddle才可以使用xformers -# pipe.enable_xformers_memory_efficient_attention() -scheduler_name = ["ddim", "pndm", "euler", "dpm-multi"] -for enable_lora in [True, False]: - images = [] - for sc in scheduler_name: - # 切换scheduler - pipe.switch_scheduler(sc) - # 指定clip_skip - clip_skip = 1 - # 指定seed - generator = paddle.Generator().manual_seed(0) - # guidance_scale - guidance_scale = 3.5 - prompt = "# shukezouma, negative space, , shuimobysim , portrait of a woman standing , willow branches, (masterpiece, best quality:1.2), traditional chinese ink painting, , modelshoot style, peaceful, (smile), looking at viewer, wearing long hanfu, hanfu, song, willow tree in background, wuchangshuo," - negative_prompt = "(worst quality:2), (low quality:2), (normal quality:2), lowres, normal quality, skin spots, acnes, skin blemishes, age spot, glans, (watermark:2)," - img = pipe( - prompt, - negative_prompt=negative_prompt, - num_inference_steps=50, - height=768, - width=512, - clip_skip=clip_skip, - guidance_scale=guidance_scale, - generator=generator, - enable_lora=enable_lora, - ).images[0] - images.append(img) - if enable_lora: - image_grid(images, 2, 2).save(f"lora_enable.png") - else: - image_grid(images, 2, 2).save(f"lora_disable.png") - - -image = np.array( - load_image("https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png") -) -image = cv2.Canny(image, 100, 200) -image = image[:, :, None] -image = np.concatenate([image] * 3, axis=2) -canny_image = Image.fromarray(image) -canny_image = canny_image.resize((512, 768)) - -# controlnet -for enable_lora in [True, False]: - images = [] - for sc in scheduler_name: - pipe.switch_scheduler(sc) - clip_skip = 1 - generator = paddle.Generator().manual_seed(0) - guidance_scale = 3.5 - prompt = "a bird " - negative_prompt = "(worst quality:2), (low quality:2), (normal quality:2), lowres, normal 
quality, skin spots, acnes, skin blemishes, age spot, glans, (watermark:2)," - img = pipe( - prompt, - image=canny_image, - negative_prompt=negative_prompt, - num_inference_steps=50, - height=None, # auto detect image height and width - width=None, # auto detect image height and width - clip_skip=clip_skip, - guidance_scale=guidance_scale, - generator=generator, - enable_lora=enable_lora, - resize_mode=1, - controlnet_conditioning_scale=1.0, - ).images[0] - images.append(img) - if enable_lora: - image_grid(images, 2, 2).save(f"lora_enable_controlnet.png") - else: - image_grid(images, 2, 2).save(f"lora_disable_controlnet.png") -``` - -生成的图片如下所示: -| lora_disable.png | lora_enable.png | lora_disable_controlnet.png | lora_enable_controlnet.png | -|:-------------------:|:-------------------:|:-------------------:|:-------------------:| -|![][lora_disable]|![][lora_enable]|![][lora_disable_controlnet]|![][lora_enable_controlnet]| - -[lora_disable]: https://user-images.githubusercontent.com/50394665/230832029-c06a1367-1f2c-4206-9666-99854fcee240.png -[lora_enable]: https://user-images.githubusercontent.com/50394665/230832028-730ce442-dd34-4e36-afd0-81d40843359a.png -[lora_disable_controlnet]: https://github.com/PaddlePaddle/PaddleNLP/assets/50394665/49ad234e-f92c-4e55-9d4c-86b5d392d704 -[lora_enable_controlnet]: https://github.com/PaddlePaddle/PaddleNLP/assets/50394665/cda43315-cfa5-490a-a2ab-09d9ded7bf44 - -### Stable Diffusion with High Resolution Fixing -`StableDiffusionHiresFixPipeline` 基于Stable Diffusion进行文图生成,同时启动高分辨率修复功能。该自定义Pipeline生成图像期间共包含两个阶段: 初始生成图像阶段和高清修复阶段。使用方式如下所示: - -```python -import paddle -from stable_diffusion_hires_fix import StableDiffusionHiresFixPipeline -from ppdiffusers import EulerAncestralDiscreteScheduler - -pipe = StableDiffusionHiresFixPipeline.from_pretrained("stabilityai/stable-diffusion-2", paddle_dtype=paddle.float16) -pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) - -generator = paddle.Generator().manual_seed(5232132133) -prompt = "1 real girl, long black hair, detailed face, light smile, chinese style, hanfu" -image = pipe(prompt, guidance_scale=7.5, height=768, width=768, generator=generator, num_inference_steps=40, hires_ratio=0.5, hr_resize_width=768, hr_resize_height=1024, enable_hr=True).images[0] - -image.show() - -``` -生成的图片如下所示: -
- - -### ControlNet Reference Only -[Reference-Only Control](https://github.com/Mikubill/sd-webui-controlnet#reference-only-control) 是一种不需要任何控制模型就可以直接使用图像作为参考来引导生成图像的方法。它使用方式如下所示: - -```python -import paddle -from reference_only import ReferenceOnlyPipeline -from ppdiffusers import DDIMScheduler -from ppdiffusers.utils import load_image - -pipe = ReferenceOnlyPipeline.from_pretrained("TASUKU2023/Chilloutmix", safety_checker=None, paddle_dtype=paddle.float16) -pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, steps_offset=1, clip_sample=False, set_alpha_to_one=False,) - -prompt = "a dog running on grassland, best quality" -input_image = load_image("https://raw.githubusercontent.com/Mikubill/sd-webui-controlnet/main/samples/dog_rel.png") - -for control_name in ["none", "reference_only", "reference_adain", "reference_adain+attn"]: - generator = paddle.Generator().manual_seed(42) - image = pipe(prompt, - guidance_scale=7., - height=512, - width=512, - image=input_image, - num_inference_steps=20, - generator=generator, - control_name=control_name, # "none", "reference_only", "reference_adain", "reference_adain+attn" - attention_auto_machine_weight=1.0, # 0.0~1.0 - gn_auto_machine_weight=1.0, # 0.0~2.0 - current_style_fidelity=0.5, # 0.0~1.0 - resize_mode=0, # ["0 means Just resize", "1 means Crop and resize", "2 means Resize and fill", "-1 means Do nothing"] - ).images[0] - image.save(control_name + ".png") -``` -生成的图片如下所示: - - -| none | reference_only | reference_adain | reference_adain+attn | -|:-------------------:|:-------------------:|:-------------------:|:-------------------:| -|![][none]|![][reference_only]|![][reference_adain]|![][reference_adain+attn]| - -[none]: https://github.com/PaddlePaddle/PaddleNLP/assets/50394665/97db3779-9dd7-4d62-ae15-5d2fda68f311 -[reference_only]: https://github.com/PaddlePaddle/PaddleNLP/assets/50394665/4d67e752-cddc-40ab-9524-39e8d9b4a428 -[reference_adain]: https://github.com/PaddlePaddle/PaddleNLP/assets/50394665/266968c7-5065-4589-9bd8-47515d50c6de -[reference_adain+attn]: https://github.com/PaddlePaddle/PaddleNLP/assets/50394665/73d53a4f-e601-4969-9cb8-e3fdf719ae0c - - -### Stable Diffusion Mixture Tiling -`StableDiffusionTilingPipeline`是一个基于Mixture机制的多文本大图生成Stable Diffusion Pipeline。使用方式如下所示: - -```python -from ppdiffusers import LMSDiscreteScheduler, DiffusionPipeline - -# Creater scheduler and model (similar to StableDiffusionPipeline) -scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000) -pipeline = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=scheduler, custom_pipeline="mixture_tiling") - -# Mixture of Diffusers generation -image = pipeline( - prompt=[[ - "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece" - ]], - tile_height=640, - tile_width=640, - tile_row_overlap=0, - tile_col_overlap=256, - guidance_scale=8, - seed=7178915308, - num_inference_steps=50, -)["images"][0] -image.save('mixture_tiling' + ".png") -``` -生成的图片如下所示: -
- -### CLIP Guided Images Mixing With Stable Diffusion -`CLIPGuidedImagesMixingStableDiffusion` 基于Stable Diffusion来针对输入的两个图片进行融合: -```python -import requests -from io import BytesIO - -import PIL -import paddle -import open_clip -from open_clip import SimpleTokenizer -from ppdiffusers import DiffusionPipeline -from paddlenlp.transformers import CLIPFeatureExtractor, CLIPModel - - -def download_image(url): - response = requests.get(url) - return PIL.Image.open(BytesIO(response.content)).convert("RGB") - -# Loading additional models -feature_extractor = CLIPFeatureExtractor.from_pretrained( - "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" -) -clip_model = CLIPModel.from_pretrained( - "laion/CLIP-ViT-B-32-laion2B-s34B-b79K", paddle_dtype=paddle.float16 -) - -mixing_pipeline = DiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - custom_pipeline="clip_guided_images_mixing_stable_diffusion", - clip_model=clip_model, - feature_extractor=feature_extractor, - paddle_dtype=paddle.float16, -) -mixing_pipeline.enable_attention_slicing() - -# Pipline running -generator = paddle.Generator().manual_seed(17) - -def download_image(url): - response = requests.get(url) - return PIL.Image.open(BytesIO(response.content)).convert("RGB") - -content_image = download_image("https://paddlenlp.bj.bcebos.com/models/community/westfish/develop/clip_guided_images_mixing_stable_diffusion_images/boromir.jpg") -style_image = download_image("https://paddlenlp.bj.bcebos.com/models/community/westfish/develop/clip_guided_images_mixing_stable_diffusion_images/gigachad.jpg") - -pipe_images = mixing_pipeline( - num_inference_steps=50, - content_image=content_image, - style_image=style_image, - content_prompt="boromir", - style_prompt="gigachad", - noise_strength=0.65, - slerp_latent_style_strength=0.9, - slerp_prompt_style_strength=0.1, - slerp_clip_image_style_strength=0.1, - guidance_scale=9.0, - batch_size=1, - clip_guidance_scale=100, - generator=generator, -).images - -pipe_images[0].save('clip_guided_images_mixing_stable_diffusion.png') -``` -图片生成效果如下所示: -
-内容图像
-风格图像
-生成图像
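
上文的三个 `slerp_*_strength` 参数都是球面线性插值(slerp)的插值系数:取 0 时结果完全落在内容一侧,取 1 时完全落在风格一侧。下面是一个最小的 numpy 示意实现,仅用于说明该插值的数学含义;`slerp_demo` 这个独立函数以及示例中的随机向量均为示意用途,实际行为以 pipeline 内部的 `slerp` 实现为准:

```python
import numpy as np

def slerp_demo(t, v0, v1, dot_threshold=0.9995):
    """球面线性插值(slerp)示意:t=0 返回 v0,t=1 返回 v1。"""
    v0 = np.asarray(v0, dtype=np.float64)
    v1 = np.asarray(v1, dtype=np.float64)
    dot = np.sum(v0 * v1) / (np.linalg.norm(v0) * np.linalg.norm(v1))
    if np.abs(dot) > dot_threshold:
        # 两向量几乎共线时,退化为普通线性插值
        return (1 - t) * v0 + t * v1
    theta_0 = np.arccos(dot)   # 两向量的夹角
    theta_t = theta_0 * t      # 插值结果与 v0 的夹角
    s0 = np.sin(theta_0 - theta_t) / np.sin(theta_0)
    s1 = np.sin(theta_t) / np.sin(theta_0)
    return s0 * v0 + s1 * v1

# 例如 slerp_latent_style_strength=0.9 相当于 t=0.9,插值结果更靠近风格一侧的向量
mixed = slerp_demo(0.9, np.random.randn(4), np.random.randn(4))
print(mixed)
```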
diff --git a/ppdiffusers/examples/community/clip_guided_images_mixing_stable_diffusion.py b/ppdiffusers/examples/community/clip_guided_images_mixing_stable_diffusion.py deleted file mode 100644 index f15356d5a855..000000000000 --- a/ppdiffusers/examples/community/clip_guided_images_mixing_stable_diffusion.py +++ /dev/null @@ -1,408 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Optional, Union - -import numpy as np -import paddle -import paddle.nn.functional as F -import PIL -from einops import rearrange - -from paddlenlp.transformers import ( - CLIPFeatureExtractor, - CLIPModel, - CLIPTextModel, - CLIPTokenizer, -) -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - DiffusionPipeline, - DPMSolverMultistepScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - UNet2DConditionModel, -) -from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import ( - StableDiffusionPipelineOutput, -) -from ppdiffusers.utils import PIL_INTERPOLATION, randn_tensor - - -def preprocess(image, w, h): - if isinstance(image, paddle.Tensor): - return image - elif isinstance(image, PIL.Image.Image): - image = [image] - if isinstance(image[0], PIL.Image.Image): - image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[(None), :] for i in image] - image = np.concatenate(image, axis=0) - image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = 2.0 * image - 1.0 - image = paddle.to_tensor(data=image) - elif isinstance(image[0], paddle.Tensor): - image = paddle.concat(x=image, axis=0) - return image - - -def slerp(t, v0, v1, DOT_THRESHOLD=0.9995): - if not isinstance(v0, np.ndarray): - inputs_are_paddle = True - # input_device = v0.place - v0 = v0.cpu().numpy() - v1 = v1.cpu().numpy() - dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1))) - if np.abs(dot) > DOT_THRESHOLD: - v2 = (1 - t) * v0 + t * v1 - else: - theta_0 = np.arccos(dot) - sin_theta_0 = np.sin(theta_0) - theta_t = theta_0 * t - sin_theta_t = np.sin(theta_t) - s0 = np.sin(theta_0 - theta_t) / sin_theta_0 - s1 = sin_theta_t / sin_theta_0 - v2 = s0 * v0 + s1 * v1 - if inputs_are_paddle: - v2 = paddle.to_tensor(data=v2) - return v2 - - -def spherical_dist_loss(x, y): - x = F.normalize(x=x, axis=-1) - y = F.normalize(x=y, axis=-1) - return ( - paddle.divide((x - y).norm(axis=-1), paddle.to_tensor(2, dtype=x.dtype)) - .asin() - .pow(y=paddle.to_tensor(2, dtype=x.dtype)) - .multiply(y=paddle.to_tensor(2, dtype=x.dtype)) - ) - - -def set_requires_grad(model, value): - for param in model.parameters(): - param.stop_gradient = not value - - -class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline): - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - clip_model: CLIPModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler, DPMSolverMultistepScheduler], - 
feature_extractor: CLIPFeatureExtractor, - ): - super().__init__() - self.register_modules( - vae=vae, - text_encoder=text_encoder, - clip_model=clip_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - feature_extractor=feature_extractor, - ) - self.feature_extractor_size = ( - feature_extractor.size - if isinstance(feature_extractor.size, int) - else feature_extractor.size["shortest_edge"] - ) - self.normalize = paddle.vision.transforms.Normalize( - mean=feature_extractor.image_mean, std=feature_extractor.image_std - ) - set_requires_grad(self.text_encoder, False) - set_requires_grad(self.clip_model, False) - - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - if slice_size == "auto": - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - self.enable_attention_slicing(None) - - def freeze_vae(self): - set_requires_grad(self.vae, False) - - def unfreeze_vae(self): - set_requires_grad(self.vae, True) - - def freeze_unet(self): - set_requires_grad(self.unet, False) - - def unfreeze_unet(self): - set_requires_grad(self.unet, True) - - def get_timesteps(self, num_inference_steps, strength): - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] - return timesteps, num_inference_steps - t_start - - def prepare_latents(self, image, timestep, batch_size, dtype, generator=None): - if not isinstance(image, paddle.Tensor): - raise ValueError(f"`image` has to be of type `torch.Tensor` but is {type(image)}") - image = image.cast(dtype) - if isinstance(generator, list): - init_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) - ] - init_latents = paddle.concat(x=init_latents, axis=0) - else: - init_latents = self.vae.encode(image).latent_dist.sample(generator) - init_latents = 0.18215 * init_latents - init_latents = init_latents.repeat_interleave(repeats=batch_size, axis=0) - noise = randn_tensor(init_latents.shape, generator=generator, dtype=dtype) - - # get latents - init_latents = self.scheduler.add_noise(init_latents, noise, timestep) - latents = init_latents - return latents - - def get_clip_image_embeddings(self, image, batch_size): - clip_image_input = self.feature_extractor.preprocess(image) - clip_image_features = ( - paddle.to_tensor(data=clip_image_input["pixel_values"][0]).unsqueeze(axis=0).astype(dtype="float16") - ) - image_embeddings_clip = self.clip_model.get_image_features(clip_image_features) - image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, axis=-1, keepdim=True) - image_embeddings_clip = image_embeddings_clip.repeat_interleave(repeats=batch_size, axis=0) - return image_embeddings_clip - - @paddle.enable_grad() - def cond_fn( - self, - latents, - timestep, - index, - text_embeddings, - noise_pred_original, - original_image_embeddings_clip, - clip_guidance_scale, - ): - out_0 = latents.detach() - out_0.stop_gradient = not True - latents = out_0 - latent_model_input = self.scheduler.scale_model_input(latents, timestep) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, timestep, encoder_hidden_states=text_embeddings).sample - if isinstance(self.scheduler, (PNDMScheduler, DDIMScheduler, DPMSolverMultistepScheduler)): - alpha_prod_t = self.scheduler.alphas_cumprod[timestep] - beta_prod_t = 1 - alpha_prod_t - 
- # compute predicted original sample from predicted noise also called - # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_original_sample = (latents - beta_prod_t**0.5 * noise_pred) / alpha_prod_t**0.5 - fac = paddle.sqrt(x=beta_prod_t) - sample = pred_original_sample * fac + latents * (1 - fac) - elif isinstance(self.scheduler, LMSDiscreteScheduler): - sigma = self.scheduler.sigmas[index] - sample = latents - sigma * noise_pred - else: - raise ValueError(f"scheduler type {type(self.scheduler)} not supported") - - # Hardcode 0.18215 because stable-diffusion-2-base has not self.vae.config.scaling_factor - sample = 1 / 0.18215 * sample - image = self.vae.decode(sample).sample - image = (image / 2 + 0.5).clip(min=0, max=1) - - # image = paddle.vision.transforms.Resize(self.feature_extractor_size)(image) - c_size = image.shape[0] - image = rearrange(image, "c t h w -> (c t) h w") - image = paddle.vision.transforms.Resize(self.feature_extractor_size)(image) - image = rearrange(image, "(c t) h w -> c t h w", c=c_size) - - image = self.normalize(image) - image_embeddings_clip = self.clip_model.get_image_features(image) - image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, axis=-1, keepdim=True) - loss = spherical_dist_loss(image_embeddings_clip, original_image_embeddings_clip).mean() * clip_guidance_scale - grads = -paddle.autograd.grad(loss, latents)[0] - if isinstance(self.scheduler, LMSDiscreteScheduler): - latents = latents.detach() + grads * sigma**2 - noise_pred = noise_pred_original - else: - noise_pred = noise_pred_original - paddle.sqrt(x=beta_prod_t) * grads - return noise_pred, latents - - @paddle.no_grad() - def __call__( - self, - style_image: Union[paddle.Tensor, PIL.Image.Image], - content_image: Union[paddle.Tensor, PIL.Image.Image], - style_prompt: Optional[str] = None, - content_prompt: Optional[str] = None, - height: Optional[int] = 512, - width: Optional[int] = 512, - noise_strength: float = 0.6, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - batch_size: Optional[int] = 1, - eta: float = 0.0, - clip_guidance_scale: Optional[float] = 100, - generator: Optional[paddle.Generator] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - slerp_latent_style_strength: float = 0.8, - slerp_prompt_style_strength: float = 0.1, - slerp_clip_image_style_strength: float = 0.1, - ): - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError(f"You have passed {batch_size} batch_size, but only {len(generator)} generators.") - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - content_text_input = self.tokenizer( - content_prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - content_text_embeddings = self.text_encoder(content_text_input.input_ids)[0] - style_text_input = self.tokenizer( - style_prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - style_text_embeddings = self.text_encoder(style_text_input.input_ids)[0] - - text_embeddings = slerp(slerp_prompt_style_strength, content_text_embeddings, style_text_embeddings) - - # duplicate text embeddings for each generation per prompt - text_embeddings = text_embeddings.repeat_interleave(repeats=batch_size, axis=0) - - # set timesteps - accepts_offset = "offset" in 
set(inspect.signature(self.scheduler.set_timesteps).parameters.keys()) - extra_set_kwargs = {} - if accepts_offset: - extra_set_kwargs["offset"] = 1 - self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) - # Some schedulers like PNDM have timesteps as arrays - # It's more optimized to move all timesteps to correct device beforehand - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, noise_strength) - latent_timestep = timesteps[:1].tile(repeat_times=[batch_size]) - - # Preprocess image - preprocessed_content_image = preprocess(content_image, width, height) - content_latents = self.prepare_latents( - preprocessed_content_image, latent_timestep, batch_size, text_embeddings.dtype, generator - ) - preprocessed_style_image = preprocess(style_image, width, height) - style_latents = self.prepare_latents( - preprocessed_style_image, latent_timestep, batch_size, text_embeddings.dtype, generator - ) - latents = slerp(slerp_latent_style_strength, content_latents, style_latents) - if clip_guidance_scale > 0: - content_clip_image_embedding = self.get_clip_image_embeddings(content_image, batch_size) - style_clip_image_embedding = self.get_clip_image_embeddings(style_image, batch_size) - clip_image_embeddings = slerp( - slerp_clip_image_style_strength, content_clip_image_embedding, style_clip_image_embedding - ) - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - max_length = content_text_input.input_ids.shape[-1] - uncond_input = self.tokenizer([""], padding="max_length", max_length=max_length, return_tensors="pd") - uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] - # duplicate unconditional embeddings for each generation per prompt - uncond_embeddings = uncond_embeddings.repeat_interleave(repeats=batch_size, axis=0) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - text_embeddings = paddle.concat(x=[uncond_embeddings, text_embeddings]) - - # get the initial random noise unless the user supplied it - - # Unlike in other pipelines, latents need to be generated in the target device - # for 1-to-1 results reproducibility with the CompVis implementation. - # However this currently doesn't work in `mps`. - latents_shape = [batch_size, self.unet.config.in_channels, height // 8, width // 8] - latents_dtype = text_embeddings.dtype - if latents is None: - latents = paddle.randn(shape=latents_shape, generator=generator, dtype=latents_dtype) - else: - if latents.shape != latents_shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - with self.progress_bar(total=num_inference_steps): - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample - - # perform classifier free guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # perform clip guidance - if clip_guidance_scale > 0: - text_embeddings_for_guidance = ( - text_embeddings.chunk(chunks=2)[1] if do_classifier_free_guidance else text_embeddings - ) - noise_pred, latents = self.cond_fn( - latents, - t, - i, - text_embeddings_for_guidance, - noise_pred, - clip_image_embeddings, - clip_guidance_scale, - ) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # Hardcode 0.18215 because stable-diffusion-2-base has not self.vae.config.scaling_factor - latents = 1 / 0.18215 * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(min=0, max=1) - image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy() - if output_type == "pil": - image = self.numpy_to_pil(image) - if not return_dict: - return image, None - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None) diff --git a/ppdiffusers/examples/community/clip_guided_stable_diffusion.py b/ppdiffusers/examples/community/clip_guided_stable_diffusion.py deleted file mode 100644 index 0629f1d08857..000000000000 --- a/ppdiffusers/examples/community/clip_guided_stable_diffusion.py +++ /dev/null @@ -1,425 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import inspect -from typing import Callable, List, Optional, Union - -import paddle -from paddle import nn -from paddle.nn import functional as F -from paddle.vision import transforms - -from paddlenlp.transformers import ( - CLIPFeatureExtractor, - CLIPModel, - CLIPTextModel, - CLIPTokenizer, -) -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - DiffusionPipeline, - LMSDiscreteScheduler, - PNDMScheduler, - UNet2DConditionModel, -) -from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from ppdiffusers.utils import logging - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class MakeCutouts(nn.Layer): - def __init__(self, cut_size, cut_power=1.0): - super().__init__() - - self.cut_size = cut_size - self.cut_power = cut_power - - def forward(self, pixel_values, num_cutouts): - sideY, sideX = pixel_values.shape[2:4] - max_size = min(sideX, sideY) - min_size = min(sideX, sideY, self.cut_size) - cutouts = [] - for _ in range(num_cutouts): - size = int(paddle.rand((1,)) ** self.cut_power * (max_size - min_size) + min_size) - offsetx = int(paddle.randint(0, sideX - size + 1, (1,))) - offsety = int(paddle.randint(0, sideY - size + 1, (1,))) - cutout = pixel_values[:, :, offsety : offsety + size, offsetx : offsetx + size] - cutouts.append(F.adaptive_avg_pool2d(cutout, self.cut_size)) - return paddle.concat(cutouts) - - -def spherical_dist_loss(x, y): - x = F.normalize(x, axis=-1) - y = F.normalize(y, axis=-1) - return ((x - y).norm(axis=-1) / 2).asin().pow(2) * 2 - - -def set_stop_gradient(model, value): - for param in model.parameters(): - param.stop_gradient = value - - -class CLIPGuidedStableDiffusion(DiffusionPipeline): - """CLIP guided stable diffusion based on the amazing repo by @crowsonkb and @Jack000 - - https://github.com/Jack000/glid-3-xl - - https://github.com/crowsonkb/k-diffusion - """ - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - clip_model: CLIPModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler], - feature_extractor: CLIPFeatureExtractor, - ): - super().__init__() - self.register_modules( - vae=vae, - text_encoder=text_encoder, - clip_model=clip_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - feature_extractor=feature_extractor, - ) - - self.normalize = transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std) - self.cut_out_size = ( - feature_extractor.size - if isinstance(feature_extractor.size, int) - else feature_extractor.size["shortest_edge"] - ) - self.make_cutouts = MakeCutouts(self.cut_out_size) - - set_stop_gradient(self.text_encoder, True) - set_stop_gradient(self.clip_model, True) - - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - self.enable_attention_slicing(None) - - def freeze_vae(self): - set_stop_gradient(self.vae, True) - - def unfreeze_vae(self): - set_stop_gradient(self.vae, False) - - def freeze_unet(self): - set_stop_gradient(self.unet, True) - - def unfreeze_unet(self): - set_stop_gradient(self.unet, False) - - def cond_fn( - self, - latents, - timestep, - index, - text_embeddings, - noise_pred_original, - text_embeddings_clip, - 
clip_guidance_scale, - num_cutouts, - use_cutouts=True, - ): - # https://github.com/PaddlePaddle/Paddle/issues/54306 in 2.5rc paddle.set_grad_enabled has bug - with paddle.set_grad_enabled(True): - latents = latents.detach() - latents.stop_gradient = False - - if isinstance(self.scheduler, LMSDiscreteScheduler): - sigma = self.scheduler.sigmas[index] - # the model input needs to be scaled to match the continuous ODE formulation in K-LMS - latent_model_input = latents / ((sigma**2 + 1) ** 0.5) - else: - latent_model_input = latents - - # predict the noise residual - noise_pred = self.unet(latent_model_input, timestep, encoder_hidden_states=text_embeddings).sample - - if isinstance(self.scheduler, (PNDMScheduler, DDIMScheduler)): - alpha_prod_t = self.scheduler.alphas_cumprod[timestep] - beta_prod_t = 1 - alpha_prod_t - # compute predicted original sample from predicted noise also called - # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5) - - fac = paddle.sqrt(beta_prod_t) - sample = pred_original_sample * (fac) + latents * (1 - fac) - elif isinstance(self.scheduler, LMSDiscreteScheduler): - sigma = self.scheduler.sigmas[index] - sample = latents - sigma * noise_pred - else: - raise ValueError(f"scheduler type {type(self.scheduler)} not supported") - - sample = 1 / 0.18215 * sample - image = self.vae.decode(sample).sample - image = (image / 2 + 0.5).clip(0, 1) - - if use_cutouts: - image = self.make_cutouts(image, num_cutouts) - else: - resize_transform = transforms.Resize(self.cut_out_size) - image = paddle.stack([resize_transform(img) for img in image], axis=0) - image = self.normalize(image).astype(latents.dtype) - - image_embeddings_clip = self.clip_model.get_image_features(image) - image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, axis=-1, keepdim=True) - - if use_cutouts: - dists = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip) - dists = dists.reshape([num_cutouts, sample.shape[0], -1]) - loss = dists.sum(2).mean(0).sum() * clip_guidance_scale - else: - loss = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip).mean() * clip_guidance_scale - - grads = -paddle.autograd.grad(loss, latents)[0] - - if isinstance(self.scheduler, LMSDiscreteScheduler): - latents = latents.detach() + grads * (sigma**2) - noise_pred = noise_pred_original - else: - noise_pred = noise_pred_original - paddle.sqrt(beta_prod_t) * grads - return noise_pred, latents - - @paddle.no_grad() - def __call__( - self, - prompt: Union[str, List[str]], - height: Optional[int] = 512, - width: Optional[int] = 512, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - clip_guidance_scale: Optional[float] = 100, - clip_prompt: Optional[Union[str, List[str]]] = None, - num_cutouts: Optional[int] = 4, - use_cutouts: Optional[bool] = True, - generator: Optional[paddle.Generator] = None, - latents: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - **kwargs, - ): - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - raise ValueError(f"`prompt` has to be of type `str` or `list` but is 
{type(prompt)}") - - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - # get prompt text embeddings - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - - if text_input_ids.shape[-1] > self.tokenizer.model_max_length: - removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] - - attention_mask = paddle.ones_like(text_input_ids) - text_embeddings = self.text_encoder(text_input_ids, attention_mask=attention_mask)[0] - - # duplicate text embeddings for each generation per prompt - bs_embed, seq_len, _ = text_embeddings.shape - text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1]) - text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - if clip_guidance_scale > 0: - if clip_prompt is not None: - clip_text_input_ids = self.tokenizer( - clip_prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ).input_ids - else: - clip_text_input_ids = text_inputs.input_ids - text_embeddings_clip = self.clip_model.get_text_features(clip_text_input_ids) - text_embeddings_clip = text_embeddings_clip / text_embeddings_clip.norm(p=2, axis=-1, keepdim=True) - # duplicate text embeddings clip for each generation per prompt - bs_embed, _ = text_embeddings_clip.shape - text_embeddings_clip = text_embeddings_clip.tile([1, num_images_per_prompt]) - text_embeddings_clip = text_embeddings_clip.reshape([bs_embed * num_images_per_prompt, -1]) - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - max_length = text_input_ids.shape[-1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - attention_mask = paddle.ones_like(uncond_input.input_ids) - uncond_embeddings = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask)[0] - - # duplicate unconditional embeddings for each generation per prompt - seq_len = uncond_embeddings.shape[1] - uncond_embeddings = uncond_embeddings.tile([batch_size, num_images_per_prompt, 1]) - uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) - - # get the initial random noise unless the user supplied it - - # Unlike in other pipelines, latents need to be generated in the target device - # for 1-to-1 results reproducibility with the CompVis implementation. - # However this currently doesn't work in `mps`. - latents_shape = [batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8] - if latents is None: - latents = paddle.randn(latents_shape, generator=generator, dtype=text_embeddings.dtype) - else: - if latents.shape != latents_shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") - - # set timesteps - self.scheduler.set_timesteps(num_inference_steps) - - # Some schedulers like PNDM have timesteps as arrays - # It's more optimized to move all timesteps to correct device beforehand - timesteps_tensor = self.scheduler.timesteps - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
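# (For reference: eta = 0.0 makes DDIM sampling deterministic, while eta = 1.0 recovers DDPM-like stochastic sampling.)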
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - - for i, t in enumerate(self.progress_bar(timesteps_tensor)): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample - - # perform classifier free guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # perform clip guidance - if clip_guidance_scale > 0: - text_embeddings_for_guidance = ( - text_embeddings.chunk(2)[1] if do_classifier_free_guidance else text_embeddings - ) - noise_pred, latents = self.cond_fn( - latents, - t, - i, - text_embeddings_for_guidance, - noise_pred, - text_embeddings_clip, - clip_guidance_scale, - num_cutouts, - use_cutouts, - ) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # scale and decode the image latents with vae - latents = 1 / 0.18215 * latents - image = self.vae.decode(latents).sample - - image = (image / 2 + 0.5).clip(0, 1) - image = image.transpose([0, 2, 3, 1]).astype("float32").numpy() - - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image, None) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None) diff --git a/ppdiffusers/examples/community/composable_stable_diffusion.py b/ppdiffusers/examples/community/composable_stable_diffusion.py deleted file mode 100644 index 7bd973f959f1..000000000000 --- a/ppdiffusers/examples/community/composable_stable_diffusion.py +++ /dev/null @@ -1,424 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
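The pipeline defined in this file composes several concepts in one image: sub-prompts are separated by "|" inside a single prompt string, an optional `weights` string assigns one weight per sub-prompt (a negative weight steers the image away from that concept), and the weighted noise predictions are combined during guidance. A minimal usage sketch follows, assuming the file is importable from the working directory and that `from_pretrained` can load a standard Stable Diffusion checkpoint into this pipeline class; the call arguments mirror the `__call__` signature defined below.

from composable_stable_diffusion import ComposableStableDiffusionPipeline

# Load a standard SD v1-4 checkpoint into the composable pipeline (assumed checkpoint layout).
pipe = ComposableStableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")

image = pipe(
    "a red sports car | a snowy mountain road",  # sub-prompts joined with "|"
    weights="1 | 1",                             # one weight per sub-prompt; use a negative value to suppress a concept
    num_inference_steps=50,
    guidance_scale=7.5,
    seed=42,                                     # this pipeline seeds paddle's RNG rather than taking a generator
).images[0]
image.save("composed.png")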
-import inspect -from typing import Callable, Optional, Union - -import paddle - -from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer -from ppdiffusers.configuration_utils import FrozenDict -from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel -from ppdiffusers.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( - StableDiffusionSafetyChecker, -) -from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler -from ppdiffusers.utils import deprecate, logging - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class ComposableStableDiffusionPipeline(DiffusionPipeline): - r""" - Pipeline for text-to-image generation using Stable Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/junnyu/stable-diffusion-v1-4-paddle) for details. - feature_extractor ([`CLIPFeatureExtractor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - ): - super().__init__() - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " - "to update the config accordingly as leaving `steps_offset` might led to incorrect results" - " in future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub," - " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file" - ) - deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if safety_checker is None: - logger.warn( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - r""" - Enable sliced attention computation. - - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - - Args: - slice_size (`str` or `int`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, - `attention_head_dim` must be a multiple of `slice_size`. - """ - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - r""" - Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go - back to computing attention in one step. - """ - # set slice_size = `None` to disable `attention slicing` - self.enable_attention_slicing(None) - - @paddle.no_grad() - def __call__( - self, - prompt: str, - height: Optional[int] = 512, - width: Optional[int] = 512, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: str = None, - # num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.0, - seed: Optional[int] = None, - latents: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - weights: Optional[str] = "", - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - reduce_memory: Optional[bool] = True, - **kwargs, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str``): - The prompt or prompts to guide the image generation. - height (`int`, *optional*, defaults to 512): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to 512): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - seed (`int`, *optional*): - Random number seed. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `seed`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - reduce_memory (`bool`, *optional*, defaults to True): - Whether or not reduce_memory when unet forward. - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - - if isinstance(prompt, str): - batch_size = 1 - else: - raise ValueError(f"`prompt` has to be of type `str`but is {type(prompt)}") - if negative_prompt is not None and not isinstance(negative_prompt, str): - raise ValueError(f"`negative_prompt` has to be of type `str`but is {type(prompt)}") - - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." 
- ) - - if "|" in prompt: - prompt = [x.strip() for x in prompt.split("|")] - print(f"composing {prompt}...") - - # get prompt text embeddings - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - - if text_input_ids.shape[-1] > self.tokenizer.model_max_length: - removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] - attention_mask = paddle.ones_like(text_input_ids) - text_embeddings = self.text_encoder(text_input_ids, attention_mask=attention_mask)[0] - - # duplicate text embeddings for each generation per prompt, using mps friendly method - # bs_embed, seq_len, _ = text_embeddings.shape - # text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1]) - # text_embeddings = text_embeddings.reshape( - # [bs_embed * num_images_per_prompt, seq_len, -1]) - - if not weights: - # specify weights for prompts (excluding the unconditional score) - print("using equal weights for all prompts...") - pos_weights = paddle.to_tensor( - [1 / (text_embeddings.shape[0] - 1)] * (text_embeddings.shape[0] - 1) - ).reshape([-1, 1, 1, 1]) - neg_weights = paddle.to_tensor([1.0]).reshape([-1, 1, 1, 1]) - mask = paddle.to_tensor([False] + [True] * pos_weights.shape[0], dtype=paddle.bool) - else: - # set prompt weight for each - num_prompts = len(prompt) if isinstance(prompt, list) else 1 - weights = [float(w.strip()) for w in weights.split("|")] - if len(weights) < num_prompts: - weights.append(1.0) - assert len(weights) == text_embeddings.shape[0], "weights specified are not equal to the number of prompts" - pos_weights = [] - neg_weights = [] - mask = [] # first one is unconditional score - for w in weights: - if w > 0: - pos_weights.append(w) - mask.append(True) - else: - neg_weights.append(abs(w)) - mask.append(False) - # normalize the weights - pos_weights = paddle.to_tensor(pos_weights).reshape([-1, 1, 1, 1]) - pos_weights = pos_weights / pos_weights.sum() - if neg_weights: - neg_weights = paddle.to_tensor(neg_weights).reshape([-1, 1, 1, 1]) - neg_weights = neg_weights / neg_weights.sum() - mask = paddle.to_tensor(mask, dtype=paddle.bool) - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. 
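# Concretely, with the normalized positive weights w_i attached to the composed sub-prompt
# scores eps_i, and the normalized negative weights v_j attached to the unconditional /
# negatively weighted scores eps_j, the combination computed further below is
#     eps_hat = sum_j v_j * eps_j + guidance_scale * (sum_i w_i * eps_i - sum_j v_j * eps_j)
# which reduces to the usual eps_hat = eps_uncond + guidance_scale * (eps_text - eps_uncond)
# when a single prompt with no extra weights is given.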
- do_classifier_free_guidance = guidance_scale > 1.0 - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - if paddle.all(mask): - uncond_tokens: str - if negative_prompt is None: - uncond_tokens = "" - else: - uncond_tokens = negative_prompt - - max_length = text_input_ids.shape[-1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - attention_mask = paddle.ones_like(uncond_input.input_ids) - uncond_embeddings = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask)[0] - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - # seq_len = uncond_embeddings.shape[1] - # uncond_embeddings = uncond_embeddings.tile( - # [batch_size, num_images_per_prompt, 1]) - # uncond_embeddings = uncond_embeddings.reshape( - # [batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) - - # update negative weights - neg_weights = paddle.to_tensor([1.0]).reshape([-1, 1, 1, 1]) - mask = paddle.to_tensor([False] + mask.tolist(), dtype=paddle.bool) - - # get the initial random noise unless the user supplied it - - # Unlike in other pipelines, latents need to be generated in the target device - # for 1-to-1 results reproducibility with the CompVis implementation. - # However this currently doesn't work in `mps`. - latents_shape = [batch_size, self.unet.in_channels, height // 8, width // 8] - if latents is None: - if seed is not None: - paddle.seed(seed) - latents = paddle.randn(latents_shape, dtype=text_embeddings.dtype) - else: - if latents.shape != latents_shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") - - # set timesteps - self.scheduler.set_timesteps(num_inference_steps) - - # Some schedulers like PNDM have timesteps as arrays - # It's more optimized to move all timesteps to correct device beforehand - timesteps_tensor = self.scheduler.timesteps - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - for i, t in enumerate(self.progress_bar(timesteps_tensor)): - # expand the latents if we are doing classifier free guidance - latent_model_input = ( - paddle.concat([latents] * text_embeddings.shape[0]) if do_classifier_free_guidance else latents - ) - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - if reduce_memory: - # reduce memory by predicting each score sequentially - noise_preds = [] - # predict the noise residual - for latent_in, text_embedding_in in zip( - latent_model_input.chunk(latent_model_input.shape[0], axis=0), - text_embeddings.chunk(text_embeddings.shape[0], axis=0), - ): - noise_preds.append(self.unet(latent_in, t, encoder_hidden_states=text_embedding_in).sample) - noise_preds = paddle.concat(noise_preds, axis=0) - else: - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample - - # perform guidance - if do_classifier_free_guidance: - mask_index = paddle.nonzero(mask).reshape([-1]) - non_mask_index = paddle.nonzero(~mask).reshape([-1]) - noise_pred_uncond = (noise_preds[non_mask_index] * neg_weights).sum(axis=0, keepdim=True) - noise_pred_text = (noise_preds[mask_index] * pos_weights).sum(axis=0, keepdim=True) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # scale and decode the image latents with vae - latents = 1 / 0.18215 * latents - image = self.vae.decode(latents).sample - - image = (image / 2 + 0.5).clip(0, 1) - - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16 - image = image.transpose([0, 2, 3, 1]).astype("float32").numpy() - - # run safety checker - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.astype(text_embeddings.dtype) - ) - else: - has_nsfw_concept = None - - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/inference_clip_guided_stable_diffusion.py b/ppdiffusers/examples/community/inference_clip_guided_stable_diffusion.py deleted file mode 100644 index 9b06e1ee5a19..000000000000 --- a/ppdiffusers/examples/community/inference_clip_guided_stable_diffusion.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -from clip_guided_stable_diffusion import CLIPGuidedStableDiffusion -from IPython.display import display -from PIL import Image - -from paddlenlp.transformers import CLIPFeatureExtractor, CLIPModel -from ppdiffusers import LMSDiscreteScheduler, StableDiffusionPipeline - - -def image_grid(imgs, rows, cols): - assert len(imgs) == rows * cols - w, h = imgs[0].size - grid = Image.new("RGB", size=(cols * w, rows * h)) - - for i, img in enumerate(imgs): - grid.paste(img, box=(i % cols * w, i // cols * h)) - return grid - - -def create_clip_guided_pipeline( - model_id="CompVis/stable-diffusion-v1-4", clip_model_id="openai/clip-vit-large-patch14", scheduler="plms" -): - pipeline = StableDiffusionPipeline.from_pretrained(model_id, paddle_dtype=paddle.float16) - - if scheduler == "lms": - scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") - else: - scheduler = pipeline.scheduler - - clip_model = CLIPModel.from_pretrained(clip_model_id) - feature_extractor = CLIPFeatureExtractor() - - guided_pipeline = CLIPGuidedStableDiffusion( - unet=pipeline.unet, - vae=pipeline.vae, - tokenizer=pipeline.tokenizer, - text_encoder=pipeline.text_encoder, - scheduler=scheduler, - clip_model=clip_model, - feature_extractor=feature_extractor, - ) - - return guided_pipeline - - -def infer( - prompt, - clip_prompt, - num_return_images=1, - num_images_per_prompt=1, - num_inference_steps=50, - clip_guidance_scale=100, - guidance_scale=7.5, - guided_pipeline=None, - negative_prompt="", - use_cutouts=True, - num_cutouts=4, - seed=None, - unfreeze_unet=True, - unfreeze_vae=True, -): - clip_prompt = clip_prompt if clip_prompt.strip() != "" else None - if unfreeze_unet: - guided_pipeline.unfreeze_unet() - else: - guided_pipeline.freeze_unet() - - if unfreeze_vae: - guided_pipeline.unfreeze_vae() - else: - guided_pipeline.freeze_vae() - - images = [] - for i in range(num_return_images): - image = guided_pipeline( - prompt=prompt, - negative_prompt=negative_prompt, - clip_prompt=clip_prompt, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - clip_guidance_scale=clip_guidance_scale, - num_cutouts=num_cutouts, - use_cutouts=use_cutouts, - seed=seed, - num_images_per_prompt=num_images_per_prompt, - ).images - images.extend(image) - - return image_grid(images, 1, len(images)) - - -if __name__ == "__main__": - prompt = "fantasy book cover, full moon, fantasy forest landscape, golden vector elements, fantasy magic, dark light night, intricate, elegant, sharp focus, illustration, highly detailed, digital painting, concept art, matte, art by WLOP and Artgerm and Albert Bierstadt, masterpiece" # @param {type: "string"} - # @markdown `clip_prompt` is optional, if you leave it blank the same prompt is sent to Stable Diffusion and CLIP - clip_prompt = "" # @param {type: "string"} - negative_prompt = "" - num_return_images = 1 # @param {type: "number"} - num_images_per_prompt = 1 # @param {type: "number"} - - num_inference_steps = 50 # @param {type: "number"} - guidance_scale = 7.5 # @param {type: "number"} - clip_guidance_scale = 100 # @param 
{type: "number"} - num_cutouts = 4 # @param {type: "number"} - use_cutouts = False # @param ["False", "True"] - unfreeze_unet = False # @param ["False", "True"] - unfreeze_vae = False # @param ["False", "True"] - seed = 3788086447 # @param {type: "number"} - - model_id = "CompVis/stable-diffusion-v1-4" - clip_model_id = "openai/clip-vit-base-patch32" # @param ["openai/clip-vit-base-patch32", "openai/clip-vit-base-patch14", "openai/clip-rn101", "openai/clip-rn50"] {allow-input: true} - scheduler = "plms" # @param ['plms', 'lms'] - guided_pipeline = create_clip_guided_pipeline(model_id, clip_model_id) - - with paddle.amp.auto_cast(True, level="O2"): - grid_image = infer( - prompt=prompt, - negative_prompt=negative_prompt, - clip_prompt=clip_prompt, - num_return_images=num_return_images, - num_images_per_prompt=num_images_per_prompt, - num_inference_steps=num_inference_steps, - clip_guidance_scale=clip_guidance_scale, - guidance_scale=guidance_scale, - guided_pipeline=guided_pipeline, - use_cutouts=use_cutouts, - num_cutouts=num_cutouts, - seed=seed, - unfreeze_unet=unfreeze_unet, - unfreeze_vae=unfreeze_vae, - ) - - display(grid_image) diff --git a/ppdiffusers/examples/community/interpolate_stable_diffusion.py b/ppdiffusers/examples/community/interpolate_stable_diffusion.py deleted file mode 100644 index 09e741b75b33..000000000000 --- a/ppdiffusers/examples/community/interpolate_stable_diffusion.py +++ /dev/null @@ -1,523 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
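The walk pipeline in this file interpolates between a list of prompts and seeds: text embeddings are blended linearly (`paddle.lerp`) while the initial noise is blended spherically (`slerp`), and every intermediate frame is rendered and written to disk. A minimal usage sketch follows, assuming the file is importable and that `from_pretrained` can load a standard Stable Diffusion checkpoint into this class; the `walk` arguments mirror the signature defined below.

from interpolate_stable_diffusion import StableDiffusionWalkPipeline

# Load a standard SD v1-4 checkpoint into the walk pipeline (assumed checkpoint layout).
pipe = StableDiffusionWalkPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")

frame_paths = pipe.walk(
    prompts=["a photo of a corgi", "a photo of a siamese cat"],
    seeds=[42, 1337],              # one seed per prompt, same list length as prompts
    num_interpolation_steps=8,     # frames generated between each pair of consecutive prompts
    output_dir="./dreams",
    num_inference_steps=50,
    guidance_scale=7.5,
)
# frame_paths lists the saved PNGs, e.g. ./dreams/<timestamp>/frame_000000.png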
- -import inspect -import time -from pathlib import Path -from typing import Callable, List, Optional, Union - -import numpy as np -import paddle - -from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer -from ppdiffusers.configuration_utils import FrozenDict -from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel -from ppdiffusers.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( - StableDiffusionSafetyChecker, -) -from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler -from ppdiffusers.utils import deprecate, logging - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -def slerp(t, v0, v1, DOT_THRESHOLD=0.9995): - """helper function to spherically interpolate two arrays v1 v2""" - - if not isinstance(v0, np.ndarray): - inputs_are_paddle = True - v0 = v0.cpu().numpy() - v1 = v1.cpu().numpy() - - dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1))) - if np.abs(dot) > DOT_THRESHOLD: - v2 = (1 - t) * v0 + t * v1 - else: - theta_0 = np.arccos(dot) - sin_theta_0 = np.sin(theta_0) - theta_t = theta_0 * t - sin_theta_t = np.sin(theta_t) - s0 = np.sin(theta_0 - theta_t) / sin_theta_0 - s1 = sin_theta_t / sin_theta_0 - v2 = s0 * v0 + s1 * v1 - - if inputs_are_paddle: - v2 = paddle.to_tensor(v2) - - return v2 - - -class StableDiffusionWalkPipeline(DiffusionPipeline): - r""" - Pipeline for text-to-image generation using Stable Diffusion. - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPFeatureExtractor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
- """ - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - ): - super().__init__() - - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " - "to update the config accordingly as leaving `steps_offset` might led to incorrect results" - " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," - " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file" - ) - deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if safety_checker is None: - logger.warn( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - r""" - Enable sliced attention computation. - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - Args: - slice_size (`str` or `int`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, - `attention_head_dim` must be a multiple of `slice_size`. - """ - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - r""" - Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go - back to computing attention in one step. 
- """ - # set slice_size = `None` to disable `attention slicing` - self.enable_attention_slicing(None) - - @paddle.no_grad() - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - height: int = 512, - width: int = 512, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[paddle.Generator] = None, - latents: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - text_embeddings: Optional[paddle.Tensor] = None, - **kwargs, - ): - r""" - Function invoked when calling the pipeline for generation. - Args: - prompt (`str` or `List[str]`, *optional*, defaults to `None`): - The prompt or prompts to guide the image generation. If not provided, `text_embeddings` is required. - height (`int`, *optional*, defaults to 512): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to 512): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - seed (`int`, *optional*): - Random number seed. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `seed`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. 
- text_embeddings (`paddle.Tensor`, *optional*, defaults to `None`): - Pre-generated text embeddings to be used as inputs for image generation. Can be used in place of - `prompt` to avoid re-computing the embeddings. If not provided, the embeddings will be generated from - the supplied `prompt`. - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if text_embeddings is None: - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - # get prompt text embeddings - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - - if text_input_ids.shape[-1] > self.tokenizer.model_max_length: - removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) - print( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] - attention_mask = paddle.ones_like(text_input_ids) - text_embeddings = self.text_encoder(text_input_ids, attention_mask=attention_mask)[0] - else: - batch_size = text_embeddings.shape[0] - - # duplicate text embeddings for each generation per prompt, using mps friendly method - bs_embed, seq_len, _ = text_embeddings.shape - text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1]) - text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - max_length = self.tokenizer.model_max_length - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - attention_mask = paddle.ones_like(uncond_input.input_ids) - uncond_embeddings = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask)[0] - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = uncond_embeddings.shape[1] - uncond_embeddings = uncond_embeddings.tile([batch_size, num_images_per_prompt, 1]) - uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) - - # get the initial random noise unless the user supplied it - - # Unlike in other pipelines, latents need to be generated in the target device - # for 1-to-1 results reproducibility with the CompVis implementation. - # However this currently doesn't work in `mps`. - latents_shape = [batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8] - latents_dtype = text_embeddings.dtype - if latents is None: - latents = paddle.randn(latents_shape, generator=generator, dtype=latents_dtype) - else: - if latents.shape != latents_shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") - latents = latents - - # set timesteps - self.scheduler.set_timesteps(num_inference_steps) - - # Some schedulers like PNDM have timesteps as arrays - # It's more optimized to move all timesteps to correct device beforehand - timesteps_tensor = self.scheduler.timesteps - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - for i, t in enumerate(self.progress_bar(timesteps_tensor)): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - latents = 1 / 0.18215 * latents - image = self.vae.decode(latents).sample - - image = (image / 2 + 0.5).clip(0, 1) - - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16 - image = image.transpose([0, 2, 3, 1]).astype("float32").numpy() - - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.astype(text_embeddings.dtype) - ) - else: - has_nsfw_concept = None - - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - def embed_text(self, text): - """takes in text and turns it into text embeddings""" - text_input = self.tokenizer( - text, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - with paddle.no_grad(): - embed = self.text_encoder(text_input.input_ids)[0] - return embed - - def get_noise(self, seed, dtype=paddle.float32, height=512, width=512): - """Takes in random seed and returns corresponding noise vector""" - return paddle.randn( - (1, self.unet.in_channels, height // 8, width // 8), - generator=paddle.Generator().manual_seed(seed), - dtype=dtype, - ) - - def walk( - self, - prompts: List[str], - seeds: List[int], - num_interpolation_steps: Optional[int] = 6, - output_dir: Optional[str] = "./dreams", - name: Optional[str] = None, - batch_size: Optional[int] = 1, - height: Optional[int] = 512, - width: Optional[int] = 512, - guidance_scale: Optional[float] = 7.5, - num_inference_steps: Optional[int] = 50, - eta: Optional[float] = 0.0, - ) -> List[str]: - """ - Walks through a series of prompts and seeds, interpolating between them and saving the results to disk. - Args: - prompts (`List[str]`): - List of prompts to generate images for. - seeds (`List[int]`): - List of seeds corresponding to provided prompts. Must be the same length as prompts. - num_interpolation_steps (`int`, *optional*, defaults to 6): - Number of interpolation steps to take between prompts. - output_dir (`str`, *optional*, defaults to `./dreams`): - Directory to save the generated images to. 
- name (`str`, *optional*, defaults to `None`): - Subdirectory of `output_dir` to save the generated images to. If `None`, the name will - be the current time. - batch_size (`int`, *optional*, defaults to 1): - Number of images to generate at once. - height (`int`, *optional*, defaults to 512): - Height of the generated images. - width (`int`, *optional*, defaults to 512): - Width of the generated images. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - Returns: - `List[str]`: List of paths to the generated images. - """ - if not len(prompts) == len(seeds): - raise ValueError( - f"Number of prompts and seeds must be equalGot {len(prompts)} prompts and {len(seeds)} seeds" - ) - - name = name or time.strftime("%Y%m%d-%H%M%S") - save_path = Path(output_dir) / name - save_path.mkdir(exist_ok=True, parents=True) - - frame_idx = 0 - frame_filepaths = [] - for prompt_a, prompt_b, seed_a, seed_b in zip(prompts, prompts[1:], seeds, seeds[1:]): - # Embed Text - embed_a = self.embed_text(prompt_a) - embed_b = self.embed_text(prompt_b) - - # Get Noise - noise_dtype = embed_a.dtype - noise_a = self.get_noise(seed_a, noise_dtype, height, width) - noise_b = self.get_noise(seed_b, noise_dtype, height, width) - - noise_batch, embeds_batch = None, None - T = np.linspace(0.0, 1.0, num_interpolation_steps) - for i, t in enumerate(T): - noise = slerp(float(t), noise_a, noise_b) - embed = paddle.lerp(embed_a, embed_b, t) - - noise_batch = noise if noise_batch is None else paddle.concat([noise_batch, noise], axis=0) - embeds_batch = embed if embeds_batch is None else paddle.concat([embeds_batch, embed], axis=0) - - batch_is_ready = embeds_batch.shape[0] == batch_size or i + 1 == T.shape[0] - if batch_is_ready: - outputs = self( - latents=noise_batch, - text_embeddings=embeds_batch, - height=height, - width=width, - guidance_scale=guidance_scale, - eta=eta, - num_inference_steps=num_inference_steps, - ) - noise_batch, embeds_batch = None, None - - for image in outputs["images"]: - frame_filepath = str(save_path / f"frame_{frame_idx:06d}.png") - image.save(frame_filepath) - frame_filepaths.append(frame_filepath) - frame_idx += 1 - return frame_filepaths diff --git a/ppdiffusers/examples/community/lpw_stable_diffusion.py b/ppdiffusers/examples/community/lpw_stable_diffusion.py deleted file mode 100644 index d6c64880620d..000000000000 --- a/ppdiffusers/examples/community/lpw_stable_diffusion.py +++ /dev/null @@ -1,1078 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re -from typing import Callable, List, Optional, Union - -import numpy as np -import paddle -import PIL - -from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer -from paddlenlp.utils.tools import compare_version -from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel -from ppdiffusers.pipelines.stable_diffusion import ( - StableDiffusionPipeline, - StableDiffusionPipelineOutput, -) -from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( - StableDiffusionSafetyChecker, -) -from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler -from ppdiffusers.utils import logging - -if compare_version(PIL.__version__, "9.1.0") >= 0: - Resampling = PIL.Image.Resampling -else: - Resampling = PIL.Image - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -re_attention = re.compile( - r""" -\\\(| -\\\)| -\\\[| -\\]| -\\\\| -\\| -\(| -\[| -:([+-]?[.\d]+)\)| -\)| -]| -[^\\()\[\]:]+| -: -""", - re.X, -) - - -def parse_prompt_attention(text): - r""" - Parses a string with attention tokens and returns a list of pairs: text and its associated weight. - Accepted tokens are: - (abc) - increases attention to abc by a multiplier of 1.1 - (abc:3.12) - increases attention to abc by a multiplier of 3.12 - [abc] - decreases attention to abc by a multiplier of 1.1 - \( - literal character '(' - \[ - literal character '[' - \) - literal character ')' - \] - literal character ']' - \\ - literal character '\' - anything else - just text - >>> parse_prompt_attention('normal text') - [['normal text', 1.0]] - >>> parse_prompt_attention('an (important) word') - [['an ', 1.0], ['important', 1.1], [' word', 1.0]] - >>> parse_prompt_attention('(unbalanced') - [['unbalanced', 1.1]] - >>> parse_prompt_attention('\(literal\]') - [['(literal]', 1.0]] - >>> parse_prompt_attention('(unnecessary)(parens)') - [['unnecessaryparens', 1.1]] - >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).') - [['a ', 1.0], - ['house', 1.5730000000000004], - [' ', 1.1], - ['on', 1.0], - [' a ', 1.1], - ['hill', 0.55], - [', sun, ', 1.1], - ['sky', 1.4641000000000006], - ['.', 1.1]] - """ - - res = [] - round_brackets = [] - square_brackets = [] - - round_bracket_multiplier = 1.1 - square_bracket_multiplier = 1 / 1.1 - - def multiply_range(start_position, multiplier): - for p in range(start_position, len(res)): - res[p][1] *= multiplier - - for m in re_attention.finditer(text): - text = m.group(0) - weight = m.group(1) - - if text.startswith("\\"): - res.append([text[1:], 1.0]) - elif text == "(": - round_brackets.append(len(res)) - elif text == "[": - square_brackets.append(len(res)) - elif weight is not None and len(round_brackets) > 0: - multiply_range(round_brackets.pop(), float(weight)) - elif text == ")" and len(round_brackets) > 0: - multiply_range(round_brackets.pop(), round_bracket_multiplier) - elif text == "]" and len(square_brackets) > 0: - multiply_range(square_brackets.pop(), square_bracket_multiplier) - else: - res.append([text, 1.0]) - - for pos in round_brackets: - multiply_range(pos, 
round_bracket_multiplier) - - for pos in square_brackets: - multiply_range(pos, square_bracket_multiplier) - - if len(res) == 0: - res = [["", 1.0]] - - # merge runs of identical weights - i = 0 - while i + 1 < len(res): - if res[i][1] == res[i + 1][1]: - res[i][0] += res[i + 1][0] - res.pop(i + 1) - else: - i += 1 - - return res - - -def get_prompts_with_weights(pipe: StableDiffusionPipeline, prompt: List[str], max_length: int): - r""" - Tokenize a list of prompts and return its tokens with weights of each token. - No padding, starting or ending token is included. - """ - tokens = [] - weights = [] - truncated = False - for text in prompt: - texts_and_weights = parse_prompt_attention(text) - text_token = [] - text_weight = [] - for word, weight in texts_and_weights: - # tokenize and discard the starting and the ending token - token = pipe.tokenizer(word).input_ids[1:-1] - text_token += token - # copy the weight by length of token - text_weight += [weight] * len(token) - # stop if the text is too long (longer than truncation limit) - if len(text_token) > max_length: - truncated = True - break - # truncate - if len(text_token) > max_length: - truncated = True - text_token = text_token[:max_length] - text_weight = text_weight[:max_length] - tokens.append(text_token) - weights.append(text_weight) - if truncated: - logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples") - return tokens, weights - - -def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77): - r""" - Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length. - """ - max_embeddings_multiples = (max_length - 2) // (chunk_length - 2) - weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length - for i in range(len(tokens)): - tokens[i] = [bos] + tokens[i] + [eos] + [pad] * (max_length - 2 - len(tokens[i])) - if no_boseos_middle: - weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i])) - else: - w = [] - if len(weights[i]) == 0: - w = [1.0] * weights_length - else: - for j in range(max_embeddings_multiples): - w.append(1.0) # weight for starting token in this chunk - w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))] - w.append(1.0) # weight for ending token in this chunk - w += [1.0] * (weights_length - len(w)) - weights[i] = w[:] - - return tokens, weights - - -def get_unweighted_text_embeddings( - pipe: StableDiffusionPipeline, - text_input: paddle.Tensor, - chunk_length: int, - no_boseos_middle: Optional[bool] = True, -): - """ - When the length of tokens is a multiple of the capacity of the text encoder, - it should be split into chunks and sent to the text encoder individually. 
- """ - max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2) - if max_embeddings_multiples > 1: - text_embeddings = [] - for i in range(max_embeddings_multiples): - # extract the i-th chunk - text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone() - - # cover the head and the tail by the starting and the ending tokens - text_input_chunk[:, 0] = text_input[0, 0] - text_input_chunk[:, -1] = text_input[0, -1] - text_embedding = pipe.text_encoder(text_input_chunk)[0] - - if no_boseos_middle: - if i == 0: - # discard the ending token - text_embedding = text_embedding[:, :-1] - elif i == max_embeddings_multiples - 1: - # discard the starting token - text_embedding = text_embedding[:, 1:] - else: - # discard both starting and ending tokens - text_embedding = text_embedding[:, 1:-1] - - text_embeddings.append(text_embedding) - text_embeddings = paddle.concat(text_embeddings, axis=1) - else: - text_embeddings = pipe.text_encoder(text_input)[0] - return text_embeddings - - -def get_weighted_text_embeddings( - pipe: StableDiffusionPipeline, - prompt: Union[str, List[str]], - uncond_prompt: Optional[Union[str, List[str]]] = None, - max_embeddings_multiples: Optional[int] = 1, - no_boseos_middle: Optional[bool] = False, - skip_parsing: Optional[bool] = False, - skip_weighting: Optional[bool] = False, - **kwargs, -): - r""" - Prompts can be assigned with local weights using brackets. For example, - prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful', - and the embedding tokens corresponding to the words get multiplied by a constant, 1.1. - Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the original mean. - Args: - pipe (`DiffusionPipeline`): - Pipe to provide access to the tokenizer and the text encoder. - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - uncond_prompt (`str` or `List[str]`): - The unconditional prompt or prompts for guide the image generation. If unconditional prompt - is provided, the embeddings of prompt and uncond_prompt are concatenated. - max_embeddings_multiples (`int`, *optional*, defaults to `1`): - The max multiple length of prompt embeddings compared to the max output length of text encoder. - no_boseos_middle (`bool`, *optional*, defaults to `False`): - If the length of text token is multiples of the capacity of text encoder, whether reserve the starting and - ending token in each of the chunk in the middle. - skip_parsing (`bool`, *optional*, defaults to `False`): - Skip the parsing of brackets. - skip_weighting (`bool`, *optional*, defaults to `False`): - Skip the weighting. When the parsing is skipped, it is forced True. 
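For reference, a quick arithmetic sketch of how `max_embeddings_multiples` extends the usable prompt length, mirroring the `max_length` formula used in the function body below (the CLIP `model_max_length` of 77 is an assumption):

```python
# Each chunk reserves two positions for its BOS/EOS tokens, so a chunk carries
# (77 - 2) = 75 prompt tokens; the padded sequence re-adds one BOS and one EOS.
model_max_length = 77            # usual CLIP tokenizer limit (assumed)
max_embeddings_multiples = 3

max_length = (model_max_length - 2) * max_embeddings_multiples + 2
print(max_length)                # 227 total token positions
print(max_length - 2)            # 225 positions available for actual prompt tokens
```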
- """ - max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 - if isinstance(prompt, str): - prompt = [prompt] - - if not skip_parsing: - prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2) - if uncond_prompt is not None: - if isinstance(uncond_prompt, str): - uncond_prompt = [uncond_prompt] - uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2) - else: - prompt_tokens = [ - token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids - ] - prompt_weights = [[1.0] * len(token) for token in prompt_tokens] - if uncond_prompt is not None: - if isinstance(uncond_prompt, str): - uncond_prompt = [uncond_prompt] - uncond_tokens = [ - token[1:-1] - for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids - ] - uncond_weights = [[1.0] * len(token) for token in uncond_tokens] - - # round up the longest length of tokens to a multiple of (model_max_length - 2) - max_length = max([len(token) for token in prompt_tokens]) - if uncond_prompt is not None: - max_length = max(max_length, max([len(token) for token in uncond_tokens])) - - max_embeddings_multiples = min( - max_embeddings_multiples, - (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, - ) - max_embeddings_multiples = max(1, max_embeddings_multiples) - max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 - - # pad the length of tokens and weights - # support bert tokenizer - bos = pipe.tokenizer.bos_token_id if pipe.tokenizer.bos_token_id is not None else pipe.tokenizer.cls_token_id - eos = pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id is not None else pipe.tokenizer.sep_token_id - pad = pipe.tokenizer.pad_token_id - prompt_tokens, prompt_weights = pad_tokens_and_weights( - prompt_tokens, - prompt_weights, - max_length, - bos, - eos, - pad, - no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, - ) - prompt_tokens = paddle.to_tensor(prompt_tokens, dtype=paddle.int64) - if uncond_prompt is not None: - uncond_tokens, uncond_weights = pad_tokens_and_weights( - uncond_tokens, - uncond_weights, - max_length, - bos, - eos, - pad, - no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, - ) - uncond_tokens = paddle.to_tensor(uncond_tokens, dtype=paddle.int64) - - # get the embeddings - text_embeddings = get_unweighted_text_embeddings( - pipe, - prompt_tokens, - pipe.tokenizer.model_max_length, - no_boseos_middle=no_boseos_middle, - ) - prompt_weights = paddle.to_tensor(prompt_weights, dtype=text_embeddings.dtype) - if uncond_prompt is not None: - uncond_embeddings = get_unweighted_text_embeddings( - pipe, - uncond_tokens, - pipe.tokenizer.model_max_length, - no_boseos_middle=no_boseos_middle, - ) - uncond_weights = paddle.to_tensor(uncond_weights, dtype=uncond_embeddings.dtype) - - # assign weights to the prompts and normalize in the sense of mean - # TODO: should we normalize by chunk or in a whole (current implementation)? 
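As a sanity check on the mean-preserving rescaling applied in the block that follows, here is a minimal NumPy sketch with toy shapes (the real tensors are `[batch, seq_len, hidden_dim]` paddle Tensors):

```python
import numpy as np

emb = np.random.randn(1, 4, 8).astype("float32")              # toy token embeddings
weights = np.array([[1.0, 1.1, 1.1, 1.0]], dtype="float32")   # per-token weights

previous_mean = emb.mean(axis=(-2, -1))                        # mean before weighting
weighted = emb * weights[..., None]                            # emphasize weighted tokens
weighted *= (previous_mean / weighted.mean(axis=(-2, -1)))[..., None, None]

# The overall mean is preserved; only the relative emphasis between tokens changes.
assert np.allclose(weighted.mean(axis=(-2, -1)), previous_mean, atol=1e-5)
```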
- if (not skip_parsing) and (not skip_weighting): - previous_mean = text_embeddings.mean(axis=[-2, -1]) - text_embeddings *= prompt_weights.unsqueeze(-1) - text_embeddings *= (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) - if uncond_prompt is not None: - previous_mean = uncond_embeddings.mean(axis=[-2, -1]) - uncond_embeddings *= uncond_weights.unsqueeze(-1) - uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) - - if uncond_prompt is not None: - return text_embeddings, uncond_embeddings - return text_embeddings, None - - -def preprocess_image(image): - w, h = image.size - w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 - image = image.resize((w, h), resample=Resampling.LANCZOS) - image = np.array(image).astype(np.float32) / 255.0 - image = image[None].transpose(0, 3, 1, 2) - image = paddle.to_tensor(image) - return 2.0 * image - 1.0 - - -def preprocess_mask(mask, scale_factor=8): - mask = mask.convert("L") - w, h = mask.size - w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 - mask = mask.resize((w // scale_factor, h // scale_factor), resample=Resampling.NEAREST) - mask = np.array(mask).astype(np.float32) / 255.0 - mask = np.tile(mask, (4, 1, 1)) - mask = mask[None].transpose(0, 1, 2, 3) # what does this step do? - mask = 1 - mask # repaint white, keep black - mask = paddle.to_tensor(mask) - return mask - - -class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline): - r""" - Pipeline for text-to-image generation using Stable Diffusion without tokens length limit, and support parsing - weighting in prompt. - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPFeatureExtractor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
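For orientation, a minimal, untested sketch of how `StableDiffusionLongPromptWeightingPipeline` is typically driven; the checkpoint name and prompt text are illustrative assumptions (and the import assumes this module file is on the path), while `text2img`, `max_embeddings_multiples`, and the weighting syntax come from the code below:

```python
import paddle
from lpw_stable_diffusion import StableDiffusionLongPromptWeightingPipeline

pipe = StableDiffusionLongPromptWeightingPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5"   # assumed checkpoint; any SD 1.x weights should work
)

# Parentheses raise and square brackets lower per-token attention, and prompts
# longer than the tokenizer window are split into chunks instead of truncated.
result = pipe.text2img(
    prompt="a photo of an (ancient:1.3) castle on a cliff, golden hour, highly detailed",
    negative_prompt="lowres, blurry, watermark",
    height=512,
    width=512,
    num_inference_steps=50,
    guidance_scale=7.5,
    max_embeddings_multiples=3,
    generator=paddle.Generator().manual_seed(42),
)
result.images[0].save("castle.png")
```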
- """ - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - requires_safety_checker: Optional[bool] = True, - ): - super().__init__( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - requires_safety_checker=requires_safety_checker, - ) - self.__init__additional__() - - def __init__additional__(self): - if not hasattr(self, "vae_scale_factor"): - setattr(self, "vae_scale_factor", 2 ** (len(self.vae.config.block_out_channels) - 1)) - - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - r""" - Enable sliced attention computation. - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - Args: - slice_size (`str` or `int`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, - `attention_head_dim` must be a multiple of `slice_size`. - """ - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - r""" - Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go - back to computing attention in one step. - """ - # set slice_size = `None` to disable `attention slicing` - self.enable_attention_slicing(None) - - def check_inputs(self, prompt, height, width, strength, callback_steps): - if not isinstance(prompt, str) and not isinstance(prompt, list): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if strength < 0 or strength > 1: - raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") - - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Union[str, List[str]], - max_embeddings_multiples: Optional[int] = 3, - **kwargs, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `list(int)`): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). 
- max_embeddings_multiples (`int`, *optional*, defaults to `3`): - The max multiple length of prompt embeddings compared to the max output length of text encoder. - """ - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - if negative_prompt is None: - negative_prompt = [""] * batch_size - elif isinstance(negative_prompt, str): - negative_prompt = [negative_prompt] * batch_size - if batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - - text_embeddings, uncond_embeddings = get_weighted_text_embeddings( - pipe=self, - prompt=prompt, - uncond_prompt=negative_prompt if do_classifier_free_guidance else None, - max_embeddings_multiples=max_embeddings_multiples, - **kwargs, - ) - bs_embed, seq_len, _ = text_embeddings.shape - text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1]) - text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - if do_classifier_free_guidance: - seq_len = uncond_embeddings.shape[1] - uncond_embeddings = uncond_embeddings.tile([1, num_images_per_prompt, 1]) - uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) - - return text_embeddings - - def get_timesteps(self, num_inference_steps, strength, is_text2img): - if is_text2img: - return self.scheduler.timesteps, num_inference_steps - else: - # get the original timestep using init_timestep - offset = self.scheduler.config.get("steps_offset", 0) - init_timestep = int(num_inference_steps * strength) + offset - init_timestep = min(init_timestep, num_inference_steps) - - t_start = max(num_inference_steps - init_timestep + offset, 0) - timesteps = self.scheduler.timesteps[t_start:] - return timesteps, num_inference_steps - t_start - - def prepare_latents(self, image, timestep, batch_size, height, width, dtype, generator, latents=None): - if image is None: - shape = ( - batch_size, - self.unet.in_channels, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - - if latents is None: - latents = paddle.randn(shape, generator=generator, dtype=dtype) - else: - if latents.shape != shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents, None, None - else: - init_latent_dist = self.vae.encode(image).latent_dist - init_latents = init_latent_dist.sample(generator=generator) - init_latents = 0.18215 * init_latents - init_latents = paddle.concat([init_latents] * batch_size, axis=0) - init_latents_orig = init_latents - shape = init_latents.shape - - # add noise to latents using the timesteps - noise = paddle.randn(shape, generator=generator, dtype=dtype) - latents = self.scheduler.add_noise(init_latents, noise, timestep) - return latents, init_latents_orig, noise - - @paddle.no_grad() - def __call__( - self, - prompt: Union[str, List[str]], - negative_prompt: Optional[Union[str, List[str]]] = None, - image: Union[paddle.Tensor, PIL.Image.Image] = None, - mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, - height: Optional[int] = 512, - width: Optional[int] = 512, - num_inference_steps: Optional[int] = 50, - 
guidance_scale: Optional[int] = 7.5, - strength: Optional[int] = 0.8, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[int] = 0.0, - generator: Optional[paddle.Generator] = None, - latents: Optional[paddle.Tensor] = None, - max_embeddings_multiples: Optional[int] = 3, - output_type: Optional[str] = "pil", - return_dict: Optional[bool] = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - is_cancelled_callback: Optional[Callable[[], bool]] = None, - callback_steps: Optional[int] = 1, - **kwargs, - ): - r""" - Function invoked when calling the pipeline for generation. - Args: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. - mask_image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be - replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a - PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should - contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. - height (`int`, *optional*, defaults to 512): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to 512): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. - `image` will be used as a starting point, adding more noise to it the larger the `strength`. The - number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added - noise will be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - A paddle.Generator to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - max_embeddings_multiples (`int`, *optional*, defaults to `3`): - The max multiple length of prompt embeddings compared to the max output length of text encoder. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs(prompt, height, width, strength, callback_steps) - - # 2. Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - text_embeddings = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - max_embeddings_multiples, - ) - dtype = text_embeddings.dtype - - # 4. Preprocess image and mask - if isinstance(image, PIL.Image.Image): - image = preprocess_image(image) - if image is not None: - image = image.astype(dtype=dtype) - if isinstance(mask_image, PIL.Image.Image): - mask_image = preprocess_mask(mask_image, self.vae_scale_factor) - if mask_image is not None: - mask_image = mask_image.astype(dtype=dtype) - mask = paddle.concat([mask_image] * batch_size * num_images_per_prompt) - else: - mask = None - - # 5. set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, image is None) - latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) - - # 6. Prepare latent variables - latents, init_latents_orig, noise = self.prepare_latents( - image, - latent_timestep, - batch_size * num_images_per_prompt, - height, - width, - dtype, - generator, - latents, - ) - - # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 8. 
Denoising loop - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - if mask is not None: - # masking - init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, t) - latents = (init_latents_proper * mask) + (latents * (1 - mask)) - - # call the callback, if provided - if i % callback_steps == 0: - if callback is not None: - callback(i, t, latents) - if is_cancelled_callback is not None and is_cancelled_callback(): - return None - - # 9. Post-processing - image = self.decode_latents(latents) - - # 10. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, text_embeddings.dtype) - - # 11. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return image, has_nsfw_concept - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - def text2img( - self, - prompt: Union[str, List[str]], - negative_prompt: Optional[Union[str, List[str]]] = None, - height: Optional[int] = 512, - width: Optional[int] = 512, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[int] = 0.0, - generator: Optional[paddle.Generator] = None, - latents: Optional[paddle.Tensor] = None, - max_embeddings_multiples: Optional[int] = 3, - output_type: Optional[str] = "pil", - return_dict: Optional[bool] = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - is_cancelled_callback: Optional[Callable[[], bool]] = None, - callback_steps: Optional[int] = 1, - **kwargs, - ): - r""" - Function for text-to-image generation. - Args: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - height (`int`, *optional*, defaults to 512): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to 512): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. 
- num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - A paddle.Generator to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - max_embeddings_multiples (`int`, *optional*, defaults to `3`): - The max multiple length of prompt embeddings compared to the max output length of text encoder. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - is_cancelled_callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. If the function returns - `True`, the inference will be cancelled. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. 
- """ - return self.__call__( - prompt=prompt, - negative_prompt=negative_prompt, - height=height, - width=width, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - num_images_per_prompt=num_images_per_prompt, - eta=eta, - generator=generator, - latents=latents, - max_embeddings_multiples=max_embeddings_multiples, - output_type=output_type, - return_dict=return_dict, - callback=callback, - callback_steps=callback_steps, - **kwargs, - ) - - def img2img( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - negative_prompt: Optional[Union[str, List[str]]] = None, - strength: Optional[float] = 0.8, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.0, - generator: Optional[paddle.Generator] = None, - max_embeddings_multiples: Optional[int] = 3, - output_type: Optional[str] = "pil", - return_dict: Optional[bool] = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - is_cancelled_callback: Optional[Callable[[], bool]] = None, - callback_steps: Optional[int] = 1, - **kwargs, - ): - r""" - Function for image-to-image generation. - Args: - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. - `image` will be used as a starting point, adding more noise to it the larger the `strength`. The - number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added - noise will be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. This parameter will be modulated by `strength`. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - A paddle.Generator to make generation deterministic. - max_embeddings_multiples (`int`, *optional*, defaults to `3`): - The max multiple length of prompt embeddings compared to the max output length of text encoder. 
- output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - is_cancelled_callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. If the function returns - `True`, the inference will be cancelled. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - return self.__call__( - prompt=prompt, - negative_prompt=negative_prompt, - image=image, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - strength=strength, - num_images_per_prompt=num_images_per_prompt, - eta=eta, - generator=generator, - max_embeddings_multiples=max_embeddings_multiples, - output_type=output_type, - return_dict=return_dict, - callback=callback, - is_cancelled_callback=is_cancelled_callback, - callback_steps=callback_steps, - **kwargs, - ) - - def inpaint( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - mask_image: Union[paddle.Tensor, PIL.Image.Image], - negative_prompt: Optional[Union[str, List[str]]] = None, - strength: Optional[float] = 0.8, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.0, - generator: Optional[paddle.Generator] = None, - max_embeddings_multiples: Optional[int] = 3, - output_type: Optional[str] = "pil", - return_dict: Optional[bool] = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - is_cancelled_callback: Optional[Callable[[], bool]] = None, - callback_steps: Optional[int] = 1, - **kwargs, - ): - r""" - Function for inpaint. - Args: - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. This is the image whose masked region will be inpainted. - mask_image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be - replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a - PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should - contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. 
- negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength` - is 1, the denoising process will be run on the masked area for the full number of iterations specified - in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more - noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur. - num_inference_steps (`int`, *optional*, defaults to 50): - The reference number of denoising steps. More denoising steps usually lead to a higher quality image at - the expense of slower inference. This parameter will be modulated by `strength`, as explained above. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - A paddle.Generator to make generation deterministic. - max_embeddings_multiples (`int`, *optional*, defaults to `3`): - The max multiple length of prompt embeddings compared to the max output length of text encoder. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. 
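A correspondingly minimal, untested sketch of the `inpaint` entry point described above; the file paths and checkpoint name are placeholders, and the mask convention (white = repaint, black = keep) follows the `mask_image` description:

```python
import PIL.Image
import paddle
from lpw_stable_diffusion import StableDiffusionLongPromptWeightingPipeline

pipe = StableDiffusionLongPromptWeightingPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5"   # assumed checkpoint
)

init_image = PIL.Image.open("photo.png").convert("RGB")   # placeholder input image
mask = PIL.Image.open("mask.png").convert("L")             # white = repaint, black = keep

result = pipe.inpaint(
    prompt="a vase of (sunflowers:1.2) on the table",
    image=init_image,
    mask_image=mask,
    strength=0.75,                     # fraction of the schedule re-run on the masked area
    num_inference_steps=50,
    guidance_scale=7.5,
    generator=paddle.Generator().manual_seed(42),
)
result.images[0].save("inpainted.png")
```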
- """ - return self.__call__( - prompt=prompt, - negative_prompt=negative_prompt, - image=image, - mask_image=mask_image, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - strength=strength, - num_images_per_prompt=num_images_per_prompt, - eta=eta, - generator=generator, - max_embeddings_multiples=max_embeddings_multiples, - output_type=output_type, - return_dict=return_dict, - callback=callback, - callback_steps=callback_steps, - **kwargs, - ) diff --git a/ppdiffusers/examples/community/mixture_tiling.py b/ppdiffusers/examples/community/mixture_tiling.py deleted file mode 100644 index 9a5626e5c790..000000000000 --- a/ppdiffusers/examples/community/mixture_tiling.py +++ /dev/null @@ -1,417 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from copy import deepcopy -from enum import Enum -from typing import List, Optional, Tuple, Union - -import paddle -from tqdm.auto import tqdm - -from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel -from ppdiffusers.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker -from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler -from ppdiffusers.utils import logging - -try: - from ligo.segments import segment - - from paddlenlp.transformers import ( - CLIPFeatureExtractor, - CLIPTextModel, - CLIPTokenizer, - ) -except ImportError: - raise ImportError("Please install paddlenlp and ligo-segments to use the mixture pipeline") -logger = logging.get_logger(__name__) -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> from ppdiffusers import LMSDiscreteScheduler, DiffusionPipeline - - >>> scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000) - >>> pipeline = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=scheduler, custom_pipeline="mixture_tiling") - - >>> image = pipeline( - >>> prompt=[[ - >>> "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - >>> "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - >>> "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece" - >>> ]], - >>> tile_height=640, - >>> tile_width=640, - >>> tile_row_overlap=0, - >>> tile_col_overlap=256, - >>> guidance_scale=8, - >>> seed=7178915308, - >>> num_inference_steps=50, - >>> )["images"][0] - ``` -""" - - -def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap): - """Given a tile row and column numbers returns the range of pixels affected by that tiles in the overall image - - Returns 
a tuple with: - - Starting coordinates of rows in pixel space - - Ending coordinates of rows in pixel space - - Starting coordinates of columns in pixel space - - Ending coordinates of columns in pixel space - """ - px_row_init = 0 if tile_row == 0 else tile_row * (tile_height - tile_row_overlap) - px_row_end = px_row_init + tile_height - px_col_init = 0 if tile_col == 0 else tile_col * (tile_width - tile_col_overlap) - px_col_end = px_col_init + tile_width - return px_row_init, px_row_end, px_col_init, px_col_end - - -def _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end): - """Translates coordinates in pixel space to coordinates in latent space""" - return px_row_init // 8, px_row_end // 8, px_col_init // 8, px_col_end // 8 - - -def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap): - """Given a tile row and column numbers returns the range of latents affected by that tiles in the overall image - - Returns a tuple with: - - Starting coordinates of rows in latent space - - Ending coordinates of rows in latent space - - Starting coordinates of columns in latent space - - Ending coordinates of columns in latent space - """ - px_row_init, px_row_end, px_col_init, px_col_end = _tile2pixel_indices( - tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap - ) - return _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end) - - -def _tile2latent_exclusive_indices( - tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap, rows, columns -): - """Given a tile row and column numbers returns the range of latents affected only by that tile in the overall image - - Returns a tuple with: - - Starting coordinates of rows in latent space - - Ending coordinates of rows in latent space - - Starting coordinates of columns in latent space - - Ending coordinates of columns in latent space - """ - row_init, row_end, col_init, col_end = _tile2latent_indices( - tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap - ) - row_segment = segment(row_init, row_end) - col_segment = segment(col_init, col_end) - # Iterate over the rest of tiles, clipping the region for the current tile - for row in range(rows): - for column in range(columns): - if row != tile_row and column != tile_col: - (clip_row_init, clip_row_end, clip_col_init, clip_col_end) = _tile2latent_indices( - row, column, tile_width, tile_height, tile_row_overlap, tile_col_overlap - ) - row_segment = row_segment - segment(clip_row_init, clip_row_end) - col_segment = col_segment - segment(clip_col_init, clip_col_end) - # return row_init, row_end, col_init, col_end - return row_segment[0], row_segment[1], col_segment[0], col_segment[1] - - -class StableDiffusionExtrasMixin: - """Mixin providing additional convenience method to Stable Diffusion pipelines""" - - def decode_latents(self, latents, cpu_vae=False): - """Decodes a given array of latents into pixel space""" - # scale and decode the image latents with vae - if cpu_vae: - lat = deepcopy(latents).cpu() - vae = deepcopy(self.vae).cpu() - else: - lat = latents - vae = self.vae - lat = 1 / 0.18215 * lat - image = vae.decode(lat).sample - image = (image / 2 + 0.5).clip(min=0, max=1) - image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy() - return self.numpy_to_pil(image) - - -class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixin): - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - 
tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - ): - super().__init__() - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - - class SeedTilesMode(Enum): - """Modes in which the latents of a particular tile can be re-seeded""" - - FULL = "full" - EXCLUSIVE = "exclusive" - - @paddle.no_grad() - def __call__( - self, - prompt: Union[str, List[List[str]]], - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - eta: Optional[float] = 0.0, - seed: Optional[int] = None, - tile_height: Optional[int] = 512, - tile_width: Optional[int] = 512, - tile_row_overlap: Optional[int] = 256, - tile_col_overlap: Optional[int] = 256, - guidance_scale_tiles: Optional[List[List[float]]] = None, - seed_tiles: Optional[List[List[int]]] = None, - seed_tiles_mode: Optional[Union[str, List[List[str]]]] = "full", - seed_reroll_regions: Optional[List[Tuple[int, int, int, int, int]]] = None, - cpu_vae: Optional[bool] = False, - ): - """ - Function to run the diffusion pipeline with tiling support. - - Args: - prompt: either a single string (no tiling) or a list of lists with all the prompts to use (one list for each row of tiles). This will also define the tiling structure. - num_inference_steps: number of diffusion steps. - guidance_scale: classifier-free guidance scale. - seed: general random seed to initialize latents. - tile_height: height in pixels of each grid tile. - tile_width: width in pixels of each grid tile. - tile_row_overlap: number of overlap pixels between tiles in consecutive rows. - tile_col_overlap: number of overlap pixels between tiles in consecutive columns. - guidance_scale_tiles: specific weights for classifier-free guidance in each tile. If None, the value provided in guidance_scale will be used. - seed_tiles: specific seeds for the initialization latents in each tile. These will override the latents generated for the whole canvas using the standard seed parameter. - seed_tiles_mode: either "full" or "exclusive". If "full", all the latents affected by the tile will be overridden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overridden. - seed_reroll_regions: a list of tuples in the form (start row, end row, start column, end column, seed) defining regions in pixel space for which the latents will be overridden using the given seed. Takes priority over seed_tiles. - cpu_vae: the decoder from latent space to pixel space can require too much GPU RAM for large images. If you run into out-of-memory errors at the end of the generation process, try setting this parameter to True to run the decoder on CPU. Slower, but should run without memory issues. - - Examples: - - Returns: - A PIL image with the generated image.
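To make the tiling geometry concrete, here is a small sketch reproducing the canvas-size and tile-placement arithmetic used in `__call__` and `_tile2pixel_indices` (the 1x3 grid and overlap values match the example in `EXAMPLE_DOC_STRING`):

```python
grid_rows, grid_cols = 1, 3
tile_height, tile_width = 640, 640
tile_row_overlap, tile_col_overlap = 0, 256

# Overall canvas size, as computed at the start of __call__.
height = tile_height + (grid_rows - 1) * (tile_height - tile_row_overlap)   # 640
width = tile_width + (grid_cols - 1) * (tile_width - tile_col_overlap)      # 1408

# Pixel range covered by each tile column (same formula as _tile2pixel_indices).
for tile_col in range(grid_cols):
    px_col_init = 0 if tile_col == 0 else tile_col * (tile_width - tile_col_overlap)
    px_col_end = px_col_init + tile_width
    print(tile_col, px_col_init, px_col_end)   # (0, 0, 640), (1, 384, 1024), (2, 768, 1408)
```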
- - """ - if not isinstance(prompt, list) or not all(isinstance(row, list) for row in prompt): - raise ValueError(f"`prompt` has to be a list of lists but is {type(prompt)}") - grid_rows = len(prompt) - grid_cols = len(prompt[0]) - if not all(len(row) == grid_cols for row in prompt): - raise ValueError("All prompt rows must have the same number of prompt columns") - if not isinstance(seed_tiles_mode, str) and ( - not isinstance(seed_tiles_mode, list) or not all(isinstance(row, list) for row in seed_tiles_mode) - ): - raise ValueError(f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}") - if isinstance(seed_tiles_mode, str): - seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))] for row in prompt] - modes = [mode.value for mode in self.SeedTilesMode] - if any(mode not in modes for row in seed_tiles_mode for mode in row): - raise ValueError(f"Seed tiles mode must be one of {modes}") - if seed_reroll_regions is None: - seed_reroll_regions = [] - batch_size = 1 - - # create original noisy latents using the timesteps - height = tile_height + (grid_rows - 1) * (tile_height - tile_row_overlap) - width = tile_width + (grid_cols - 1) * (tile_width - tile_col_overlap) - latents_shape = (batch_size, self.unet.config.in_channels, height // 8, width // 8) - generator = paddle.Generator().manual_seed(seed) - latents = paddle.randn(shape=latents_shape, generator=generator) - - # overwrite latents for specific tiles if provided - if seed_tiles is not None: - for row in range(grid_rows): - for col in range(grid_cols): - if (seed_tile := seed_tiles[row][col]) is not None: - mode = seed_tiles_mode[row][col] - if mode == self.SeedTilesMode.FULL.value: - row_init, row_end, col_init, col_end = _tile2latent_indices( - row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap - ) - else: - row_init, row_end, col_init, col_end = _tile2latent_exclusive_indices( - row, - col, - tile_width, - tile_height, - tile_row_overlap, - tile_col_overlap, - grid_rows, - grid_cols, - ) - tile_generator = paddle.Generator().manual_seed(seed_tile) - tile_shape = latents_shape[0], latents_shape[1], row_end - row_init, col_end - col_init - latents[:, :, row_init:row_end, col_init:col_end] = paddle.randn( - shape=tile_shape, generator=tile_generator - ) - - # overwrite again for seed reroll regions - for row_init, row_end, col_init, col_end, seed_reroll in seed_reroll_regions: - row_init, row_end, col_init, col_end = _pixel2latent_indices( - row_init, row_end, col_init, col_end - ) # to latent space coordinates - reroll_generator = paddle.Generator().manual_seed(seed_reroll) - region_shape = latents_shape[0], latents_shape[1], row_end - row_init, col_end - col_init - latents[:, :, row_init:row_end, col_init:col_end] = paddle.randn( - shape=region_shape, generator=reroll_generator - ) - - # Prepare scheduler - accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys()) - extra_set_kwargs = {} - if accepts_offset: - extra_set_kwargs["offset"] = 1 - self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) - # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas - if isinstance(self.scheduler, LMSDiscreteScheduler): - latents = latents * self.scheduler.sigmas[0] - - # get prompts text embeddings - text_input = [ - [ - self.tokenizer( - col, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - for col in row - ] - for row in prompt - ] - 
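The denoising loop further below blends overlapping tile predictions by accumulating each tile's noise estimate times a Gaussian weight mask and then dividing by the summed weights. A toy 1-D NumPy sketch of that blending (the sizes and the Hanning mask are illustrative assumptions, not the pipeline's actual Gaussian mask):

```python
import numpy as np

length, tile, overlap = 12, 8, 4                  # toy 1-D "canvas" and tile sizes
starts = [0, tile - overlap]                      # two tiles: positions 0-7 and 4-11
mask = np.hanning(tile) + 1e-3                    # smooth weights, non-zero at the edges

canvas = np.zeros(length)
contributors = np.zeros(length)
for start, pred in zip(starts, [np.full(tile, 1.0), np.full(tile, 2.0)]):
    canvas[start:start + tile] += pred * mask     # weighted contribution of this tile
    contributors[start:start + tile] += mask      # total weight received per position
canvas /= contributors                            # weighted average in the overlap

print(canvas)   # ~1.0 on the left, ~2.0 on the right, a smooth blend at positions 4-7
```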
text_embeddings = [[self.text_encoder(col.input_ids)[0] for col in row] for row in text_input] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - for i in range(grid_rows): - for j in range(grid_cols): - max_length = text_input[i][j].input_ids.shape[-1] - uncond_input = self.tokenizer( - [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pd" - ) - uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - text_embeddings[i][j] = paddle.concat(x=[uncond_embeddings, text_embeddings[i][j]]) - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # Mask for tile weights strenght - tile_weights = self._gaussian_weights(tile_width, tile_height, batch_size) - - # Diffusion timesteps - for i, t in tqdm(enumerate(self.scheduler.timesteps)): - # Diffuse each tile - noise_preds = [] - for row in range(grid_rows): - noise_preds_row = [] - for col in range(grid_cols): - px_row_init, px_row_end, px_col_init, px_col_end = _tile2latent_indices( - row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap - ) - tile_latents = latents[:, :, px_row_init:px_row_end, px_col_init:px_col_end] - # expand the latents if we are doing classifier free guidance - latent_model_input = ( - paddle.concat(x=[tile_latents] * 2) if do_classifier_free_guidance else tile_latents - ) - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings[row][col])[ - "sample" - ] - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) - guidance = ( - guidance_scale - if guidance_scale_tiles is None or guidance_scale_tiles[row][col] is None - else guidance_scale_tiles[row][col] - ) - noise_pred_tile = noise_pred_uncond + guidance * (noise_pred_text - noise_pred_uncond) - noise_preds_row.append(noise_pred_tile) - noise_preds.append(noise_preds_row) - # Stitch noise predictions for all tiles - noise_pred = paddle.zeros(shape=latents.shape) - contributors = paddle.zeros(shape=latents.shape) - # Add each tile contribution to overall latents - for row in range(grid_rows): - for col in range(grid_cols): - px_row_init, px_row_end, px_col_init, px_col_end = _tile2latent_indices( - row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap - ) - noise_pred[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += ( - noise_preds[row][col] * tile_weights - ) - contributors[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += tile_weights - # Average overlapping areas with more 
than 1 contributor - noise_pred /= contributors - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents).prev_sample - - # scale and decode the image latents with vae - image = self.decode_latents(latents, cpu_vae) - return {"images": image} - - def _gaussian_weights(self, tile_width, tile_height, nbatches): - """Generates a gaussian mask of weights for tile contributions""" - import numpy as np - from numpy import exp, pi, sqrt - - latent_width = tile_width // 8 - latent_height = tile_height // 8 - var = 0.01 - midpoint = (latent_width - 1) / 2 - x_probs = [ - (exp(-(x - midpoint) * (x - midpoint) / (latent_width * latent_width) / (2 * var)) / sqrt(2 * pi * var)) - for x in range(latent_width) - ] - midpoint = latent_height / 2 - y_probs = [ - (exp(-(y - midpoint) * (y - midpoint) / (latent_height * latent_height) / (2 * var)) / sqrt(2 * pi * var)) - for y in range(latent_height) - ] - weights = np.outer(y_probs, x_probs) - return paddle.tile( - x=paddle.to_tensor(data=weights), repeat_times=(nbatches, self.unet.config.in_channels, 1, 1) - ) diff --git a/ppdiffusers/examples/community/one_step_unet.py b/ppdiffusers/examples/community/one_step_unet.py deleted file mode 100644 index 5baffefdab06..000000000000 --- a/ppdiffusers/examples/community/one_step_unet.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import paddle - -from ppdiffusers import DiffusionPipeline - - -class UnetSchedulerOneForwardPipeline(DiffusionPipeline): - def __init__(self, unet, scheduler): - super().__init__() - - self.register_modules(unet=unet, scheduler=scheduler) - - def __call__(self): - image = paddle.randn( - (1, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size), - ) - timestep = 1 - - model_output = self.unet(image, timestep).sample - scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample - - result = scheduler_output - scheduler_output + paddle.ones_like(scheduler_output) - - return result diff --git a/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_hires_fix.py b/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_hires_fix.py deleted file mode 100644 index a2f1393f3a3b..000000000000 --- a/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_hires_fix.py +++ /dev/null @@ -1,589 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -from typing import Callable, Dict, List, Optional, Union - -import paddle -import paddle.nn.functional as F -import PIL - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTokenizer -from ppdiffusers import DiffusionPipeline -from ppdiffusers.pipelines.fastdeploy_utils import ( - FastDeployDiffusionPipelineMixin, - FastDeployRuntimeModel, -) -from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from ppdiffusers.schedulers import KarrasDiffusionSchedulers -from ppdiffusers.utils import logging, randn_tensor - -logger = logging.get_logger(__name__) - - -class FastStableDiffusionHiresFixPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): - r""" - Pipeline for text-to-image generation with high resolution fixing(hires.fix) based on Stable Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving etc.) - - Args: - vae_encoder ([`FastDeployRuntimeModel`]): - Variational Auto-Encoder (VAE) Model to encode images to latent representations. - vae_decoder ([`FastDeployRuntimeModel`]): - Variational Auto-Encoder (VAE) Model to decode images from latent representations. - text_encoder ([`FastDeployRuntimeModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`FastDeployRuntimeModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`PNDMScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] - or [`DPMSolverMultistepScheduler`]. - safety_checker ([`FastDeployRuntimeModel`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
- """ - _optional_components = ["vae_encoder", "safety_checker", "feature_extractor"] - - def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = False, - ): - super().__init__() - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - self.register_modules( - vae_encoder=vae_encoder, - vae_decoder=vae_decoder, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.register_to_config(requires_safety_checker=requires_safety_checker) - self.post_init() - - def get_timesteps(self, denoising_steps, denoising_strength): - steps = int(denoising_steps / min(denoising_strength, 0.999)) - self.scheduler.set_timesteps(steps) - - t_start = max(steps - denoising_steps, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] - - if hasattr(self.scheduler, "step_index_offset"): - self.scheduler.step_index_offset = t_start * self.scheduler.order - - return timesteps.cast("float32"), denoising_steps - - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - hr_scale, - hr_resize_height, - hr_resize_width, - denoising_strength, - latent_scale_mode, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if hr_scale < 0:
-            raise ValueError(f"`hr_scale` should be greater than 0, but received {hr_scale}")
-
-        if hr_resize_height % 8 != 0 or hr_resize_width % 8 != 0:
-            raise ValueError(
-                f"`hr_resize_height` and `hr_resize_width` have to be divisible by 8 but are {hr_resize_height} and {hr_resize_width}."
-            )
-
-        if denoising_strength > 1 or denoising_strength < 0:
-            raise ValueError(f"`denoising_strength` should be between 0 and 1, but received {denoising_strength}")
-
-        if negative_prompt is not None and negative_prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-            )
-
-        if latent_scale_mode not in ["nearest", "bilinear", "bicubic", "area"]:
-            raise ValueError(
-                f"`latent_scale_mode` must be one of ['nearest', 'bilinear', 'bicubic', 'area'], but received {latent_scale_mode}."
-            )
-
-        if prompt_embeds is not None and negative_prompt_embeds is not None:
-            if prompt_embeds.shape != negative_prompt_embeds.shape:
-                raise ValueError(
-                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
-                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
-                    f" {negative_prompt_embeds.shape}."
-                )
-
-    def get_upscaled_width_and_height(self, width, height, hr_scale=2, hr_resize_width=0, hr_resize_height=0):
-        if hr_resize_width == 0 and hr_resize_height == 0:
-            hr_upscale_to_width = int(width * hr_scale)
-            hr_upscale_to_height = int(height * hr_scale)
-        else:
-            if hr_resize_height == 0:
-                hr_upscale_to_width = hr_resize_width
-                hr_upscale_to_height = hr_resize_width * height // width
-            elif hr_resize_width == 0:
-                hr_upscale_to_width = hr_resize_height * width // height
-                hr_upscale_to_height = hr_resize_height
-            else:
-                src_ratio = width / height
-                dst_ratio = hr_resize_width / hr_resize_height
-
-                if src_ratio < dst_ratio:
-                    hr_upscale_to_width = hr_resize_width
-                    hr_upscale_to_height = hr_resize_width * height // width
-                else:
-                    hr_upscale_to_width = hr_resize_height * width // height
-                    hr_upscale_to_height = hr_resize_height
-
-        return hr_upscale_to_width, hr_upscale_to_height
-
-    @paddle.no_grad()
-    def __call__(
-        self,
-        prompt: Union[str, List[str]] = None,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        num_inference_steps: int = 40,
-        hires_ratio: Optional[float] = 0.5,
-        guidance_scale: float = 7.5,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: Optional[int] = 1,
-        eta: float = 0.0,
-        generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
-        latents: Optional[paddle.Tensor] = None,
-        parse_prompt_type: Optional[str] = "lpw",
-        max_embeddings_multiples: Optional[int] = 3,
-        prompt_embeds: Optional[paddle.Tensor] = None,
-        negative_prompt_embeds: Optional[paddle.Tensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
-        callback_steps: Optional[int] = 1,
-        enable_hr: Optional[bool] = True,
-        hr_scale: Optional[float] = 2.0,
-        hr_resize_width: Optional[int] = 0,
-        hr_resize_height: Optional[int] = 0,
-        denoising_strength: Optional[float] = 0.7,
-        latent_scale_mode: Optional[str] = "nearest",
controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, - controlnet_conditioning_scale: float = 1.0, - infer_op_dict: Dict[str, str] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 40): - The number of denoising steps, equal to sample_steps and hr_steps. samples_steps means the initial - denoising steps, and hr_steps means hires denoising steps. More denoising steps usually lead to a - higher quality image at the expense of slower inference. - hires_ratio (`float`, *optional*, defaults to 0.5): - The step proportion of hires.fix, that means hr_steps = int(num_inference_steps * hires_ratio). - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-                plain tuple.
-            callback (`Callable`, *optional*):
-                A function that will be called every `callback_steps` steps during inference. The function will be
-                called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`.
-            callback_steps (`int`, *optional*, defaults to 1):
-                The frequency at which the `callback` function will be called. If not specified, the callback will be
-                called at every step.
-            hr_steps (`int`, *optional*, defaults to 30):
-                The number of denoising steps used for the second, high-resolution (hires.fix) pass.
-            hr_scale (`float`, *optional*, defaults to 2.0):
-                The factor by which to upscale the width and height of the image. If set to 2.0, the image is expanded to width*2.0 and height*2.0.
-            hr_resize_width (`int`, *optional*, defaults to 0):
-                Enables users to specify the upscaled width manually. If hr_resize_width != 0, it is used to compute the scaled width and height instead of hr_scale.
-            hr_resize_height (`int`, *optional*, defaults to 0):
-                Enables users to specify the upscaled height manually. If hr_resize_height != 0, it is used to compute the scaled width and height instead of hr_scale.
-            denoising_strength (`float`, *optional*, defaults to 0.7):
-                The denoising strength applied to the hires.fix steps. It takes a value between 0 and 1.
-            latent_scale_mode (`str`, *optional*, defaults to nearest):
-                The interpolation method used to upscale the initial latents; one of [nearest, bilinear, bicubic, area].
-
-        Examples:
-
-        Returns:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
-            When returning a tuple, the first element is a list with the generated images, and the second element is a
-            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-            (nsfw) content, according to the `safety_checker`.
-        """
-        # 0. Default height and width to unet
-        height = height or 512
-        width = width or 512
-
-        # 1. Check inputs. Raise error if not correct
-        self.check_inputs(
-            prompt,
-            height,
-            width,
-            callback_steps,
-            hr_scale,
-            hr_resize_height,
-            hr_resize_width,
-            denoising_strength,
-            latent_scale_mode,
-            negative_prompt,
-            prompt_embeds,
-            negative_prompt_embeds,
-        )
-        infer_op_dict = self.prepare_infer_op_dict(infer_op_dict)
-
-        # 2. Define call parameters
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
-
-        # do_controlnet
-        do_controlnet = controlnet_cond is not None
-        if do_controlnet:
-            control_image, control_conditioning_scale = self.prepare_controlnet_cond(
-                controlnet_cond=controlnet_cond,
-                controlnet_conditioning_scale=controlnet_conditioning_scale,
-                width=width,
-                height=height,
-                batch_size=batch_size,
-                num_images_per_prompt=num_images_per_prompt,
-                do_classifier_free_guidance=do_classifier_free_guidance,
-            )
-
-        # 3.
Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - parse_prompt_type=parse_prompt_type, - max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), - ) - - # 4. Prepare timesteps - if enable_hr: - hr_steps = int(num_inference_steps * hires_ratio) - sample_steps = num_inference_steps - hr_steps - else: - hr_steps = 0 - sample_steps = num_inference_steps - - self.scheduler.set_timesteps(sample_steps) - timesteps = self.scheduler.timesteps.cast("float32") - - # 5. Prepare latent variables - if generator is None: - generator_state = paddle.get_cuda_rng_state() - paddle.Generator().states_["initial_generator"] = copy.deepcopy(generator_state) - else: - paddle.Generator().states_["initial_generator"] = copy.deepcopy(paddle.Generator().states_[generator]) - - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - height, - width, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - sample_steps * self.scheduler.order - is_scheduler_support_step_index = self.is_scheduler_support_step_index() - with self.progress_bar(total=sample_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) - else: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - unet_inputs = dict( - sample=latent_model_input, - timestep=t, - encoder_hidden_states=prompt_embeds, - infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, - ) - if do_controlnet: - unet_inputs["controlnet_cond"] = control_image - unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale - # predict the noise residual - noise_pred_unet = self.unet(**unet_inputs)[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - else: - noise_pred = noise_pred_unet - - # compute the previous noisy sample x_t -> x_t-1 - if is_scheduler_support_step_index: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, step_index=i, return_pred_original_sample=False, **extra_step_kwargs - ) - else: - scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) - latents = scheduler_output.prev_sample - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - if i == len(timesteps) - 1: - # sync for accuracy it/s measure - paddle.device.cuda.synchronize() - - # start to apply hires.fix on initial latents - if enable_hr: - # 8. 
determine the upscaled width and height for upscaled images - truncate_width = 0 - truncate_height = 0 - hr_upscale_to_width, hr_upscale_to_height = self.get_upscaled_width_and_height( - width, height, hr_scale=hr_scale, hr_resize_width=hr_resize_width, hr_resize_height=hr_resize_height - ) - if hr_resize_width != 0 and hr_resize_height != 0: - truncate_width = (hr_upscale_to_width - hr_resize_width) // self.vae_scale_factor - truncate_height = (hr_upscale_to_height - hr_resize_height) // self.vae_scale_factor - - # 9. special case: do nothing if upscaling is not nesscessary - if hr_upscale_to_width == width and hr_upscale_to_height == height: - enable_hr = False - denoising_strength = None - - if enable_hr: - if do_controlnet: - control_image, control_conditioning_scale = self.prepare_controlnet_cond( - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=controlnet_conditioning_scale, - width=hr_upscale_to_width, - height=hr_upscale_to_height, - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - ) - - # 10. prepare init latents - timesteps, hr_steps = self.get_timesteps(hr_steps, denoising_strength) - init_timestep = timesteps[:1].tile([latents.shape[0]]) - - latents = F.interpolate( - latents, - size=( - hr_upscale_to_height // self.vae_scale_factor, - hr_upscale_to_width // self.vae_scale_factor, - ), - mode=latent_scale_mode, - ) - latents = latents[ - :, - :, - truncate_height // 2 : latents.shape[2] - (truncate_height + 1) // 2, - truncate_width // 2 : latents.shape[3] - (truncate_width + 1) // 2, - ] - - noise = randn_tensor(latents.shape, dtype=latents.dtype, generator="initial_generator") - latents = self.scheduler.add_noise(latents, noise, init_timestep) - - # 11. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs("initial_generator", eta) - - # 12. 
denoising on hires.fix steps - num_warmup_steps = len(timesteps) - hr_steps * self.scheduler.order - with self.progress_bar(total=hr_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) - else: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - unet_inputs = dict( - sample=latent_model_input, - timestep=t, - encoder_hidden_states=prompt_embeds, - infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, - ) - if do_controlnet: - unet_inputs["controlnet_cond"] = control_image - unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale - # predict the noise residual - noise_pred_unet = self.unet(**unet_inputs)[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - else: - noise_pred = noise_pred_unet - - # compute the previous noisy sample x_t -> x_t-1 - if is_scheduler_support_step_index: - scheduler_output = self.scheduler.step( - noise_pred, - t, - latents, - step_index=i, - return_pred_original_sample=False, - **extra_step_kwargs, - ) - else: - scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) - latents = scheduler_output.prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - if i == len(timesteps) - 1: - # sync for accuracy it/s measure - paddle.device.cuda.synchronize() - - if not output_type == "latent": - image = self._decode_vae_latents( - latents / self.vae_scaling_factor, infer_op=infer_op_dict.get("vae_decoder", None) - ) - image, has_nsfw_concept = self.run_safety_checker(image) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_mixture_tiling.py b/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_mixture_tiling.py deleted file mode 100644 index 594d9d360fc6..000000000000 --- a/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_mixture_tiling.py +++ /dev/null @@ -1,453 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from copy import deepcopy -from enum import Enum -from typing import Dict, List, Optional, Tuple, Union - -import numpy as np -import paddle -from tqdm.auto import tqdm - -# from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel -from ppdiffusers.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.fastdeploy_utils import ( - FastDeployDiffusionPipelineMixin, - FastDeployRuntimeModel, -) - -# from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker -from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler -from ppdiffusers.utils import logging - -try: - from ligo.segments import segment - - from paddlenlp.transformers import ( # CLIPTextModel, - CLIPFeatureExtractor, - CLIPTokenizer, - ) -except ImportError: - raise ImportError("Please install paddlenlp and ligo-segments to use the mixture pipeline") -logger = logging.get_logger(__name__) - - -def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap): - """Given a tile row and column numbers returns the range of pixels affected by that tiles in the overall image - - Returns a tuple with: - - Starting coordinates of rows in pixel space - - Ending coordinates of rows in pixel space - - Starting coordinates of columns in pixel space - - Ending coordinates of columns in pixel space - """ - px_row_init = 0 if tile_row == 0 else tile_row * (tile_height - tile_row_overlap) - px_row_end = px_row_init + tile_height - px_col_init = 0 if tile_col == 0 else tile_col * (tile_width - tile_col_overlap) - px_col_end = px_col_init + tile_width - return px_row_init, px_row_end, px_col_init, px_col_end - - -def _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end): - """Translates coordinates in pixel space to coordinates in latent space""" - return px_row_init // 8, px_row_end // 8, px_col_init // 8, px_col_end // 8 - - -def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap): - """Given a tile row and column numbers returns the range of latents affected by that tiles in the overall image - - Returns a tuple with: - - Starting coordinates of rows in latent space - - Ending coordinates of rows in latent space - - Starting coordinates of columns in latent space - - Ending coordinates of columns in latent space - """ - px_row_init, px_row_end, px_col_init, px_col_end = _tile2pixel_indices( - tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap - ) - return _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end) - - -def _tile2latent_exclusive_indices( - tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap, rows, columns -): - """Given a tile row and column numbers returns the range of latents affected only by that tile in the overall image - - Returns a tuple with: - - Starting coordinates of rows in latent space - - Ending coordinates of rows in latent space - - Starting coordinates of columns in latent space - - Ending coordinates of columns in latent space - """ - row_init, row_end, col_init, col_end = _tile2latent_indices( - tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap - ) - row_segment = segment(row_init, row_end) - col_segment = segment(col_init, col_end) - # Iterate over the rest of tiles, clipping the region for the current tile - for 
row in range(rows): - for column in range(columns): - if row != tile_row and column != tile_col: - (clip_row_init, clip_row_end, clip_col_init, clip_col_end) = _tile2latent_indices( - row, column, tile_width, tile_height, tile_row_overlap, tile_col_overlap - ) - row_segment = row_segment - segment(clip_row_init, clip_row_end) - col_segment = col_segment - segment(clip_col_init, clip_col_end) - # return row_init, row_end, col_init, col_end - return row_segment[0], row_segment[1], col_segment[0], col_segment[1] - - -class StableDiffusionExtrasMixin: - """Mixin providing additional convenience method to Stable Diffusion pipelines""" - - def _decode_vae_latents(self, latents: paddle.Tensor, infer_op=None, **kwargs): - latents_shape = latents.shape - output_shape = [ - latents_shape[0], - 4, - latents_shape[2] * self.vae_scale_factor, - latents_shape[3] * self.vae_scale_factor, - ] - print(output_shape) - print(latents.shape) - images_vae = self.vae_decoder( - latent_sample=latents, - infer_op=infer_op, - output_shape=output_shape, - )[0] - - return images_vae - - def decode_latents(self, latents, cpu_vae=False): - """Decodes a given array of latents into pixel space""" - # scale and decode the image latents with vae - if cpu_vae: - lat = deepcopy(latents).cpu() - vae = deepcopy(self.vae).cpu() - else: - lat = latents - vae = self.vae - lat = 1 / 0.18215 * lat - image = vae.decode(lat).sample - image = (image / 2 + 0.5).clip(min=0, max=1) - image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy() - return self.numpy_to_pil(image) - - -class FastDeployStableDiffusionTilingPipeline( - DiffusionPipeline, StableDiffusionExtrasMixin, FastDeployDiffusionPipelineMixin -): - def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: Union[DDIMScheduler, PNDMScheduler], - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPFeatureExtractor, - ): - super().__init__() - self.register_modules( - vae_encoder=vae_encoder, - vae_decoder=vae_decoder, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.post_init() - - class SeedTilesMode(Enum): - """Modes in which the latents of a particular tile can be re-seeded""" - - FULL = "full" - EXCLUSIVE = "exclusive" - - @paddle.no_grad() - def __call__( - self, - prompt: Union[str, List[List[str]]], - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - eta: Optional[float] = 0.0, - seed: Optional[int] = None, - tile_height: Optional[int] = 512, - tile_width: Optional[int] = 512, - tile_row_overlap: Optional[int] = 256, - tile_col_overlap: Optional[int] = 256, - guidance_scale_tiles: Optional[List[List[float]]] = None, - seed_tiles: Optional[List[List[int]]] = None, - seed_tiles_mode: Optional[Union[str, List[List[str]]]] = "full", - seed_reroll_regions: Optional[List[Tuple[int, int, int, int, int]]] = None, - # parse_prompt_type: Optional[str] = "lpw", - # max_embeddings_multiples: Optional[int] = 3, - infer_op_dict: Dict[str, str] = None, - ): - """ - Function to run the diffusion pipeline with tiling support. - - Args: - prompt: either a single string (no tiling) or a list of lists with all the prompts to use (one list for each row of tiles). This will also define the tiling structure. - num_inference_steps: number of diffusions steps. 
-            guidance_scale: classifier-free guidance.
-            seed: general random seed to initialize latents.
-            tile_height: height in pixels of each grid tile.
-            tile_width: width in pixels of each grid tile.
-            tile_row_overlap: number of overlap pixels between tiles in consecutive rows.
-            tile_col_overlap: number of overlap pixels between tiles in consecutive columns.
-            guidance_scale_tiles: specific weights for classifier-free guidance in each tile. If None, the value provided in guidance_scale will be used.
-            seed_tiles: specific seeds for the initialization latents in each tile. These will override the latents generated for the whole canvas using the standard seed parameter.
-            seed_tiles_mode: either "full" or "exclusive". If "full", all the latents affected by the tile will be overridden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overridden.
-            seed_reroll_regions: a list of tuples in the form (start row, end row, start column, end column, seed) defining regions in pixel space for which the latents will be overridden using the given seed. Takes priority over seed_tiles.
-            # cpu_vae: the decoder from latent space to pixel space can require too much GPU RAM for large images. If you find out-of-memory errors at the end of the generation process, try setting this parameter to True to run the decoder on the CPU. Slower, but should run without memory issues.
-            # parse_prompt_type: This parameter specifies the type of prompt parsing to be performed. Chosen from: "None", "lpw", "raw", "webui".
-            # max_embeddings_multiples: This parameter determines the maximum number of embeddings that can be generated. The value of 3 means that the maximum number of embeddings allowed will be three times the size of the original number.
-            infer_op_dict: a dictionary that maps each module to its inference op. The ops are chosen from the following: 'None', 'zero_copy_infer', 'raw'.
-
-        Examples:
-
-        Returns:
-            A PIL image with the generated image.
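# --- Editor's illustrative sketch (not part of the original file) ---
# A minimal call of the tiling pipeline documented above, assuming a FastDeploy model
# directory "./stable-diffusion-v1-5-fastdeploy" already exists; the argument names
# follow the __call__ signature shown here, and the return value is the {"images": [...]}
# dict built at the end of __call__.
#
#     pipe = FastDeployStableDiffusionTilingPipeline.from_pretrained("./stable-diffusion-v1-5-fastdeploy")
#     result = pipe(
#         prompt=[["a pine forest", "a mountain lake"]],  # 1 row x 2 columns of tiles
#         tile_height=512,
#         tile_width=512,
#         tile_row_overlap=0,
#         tile_col_overlap=256,
#         seed=1234,
#         num_inference_steps=50,
#     )
#     result["images"][0].save("panorama.png")
# --------------------------------------------------------------------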
- - """ - infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) - - if not isinstance(prompt, list) or not all(isinstance(row, list) for row in prompt): - raise ValueError(f"`prompt` has to be a list of lists but is {type(prompt)}") - grid_rows = len(prompt) - grid_cols = len(prompt[0]) - if not all(len(row) == grid_cols for row in prompt): - raise ValueError("All prompt rows must have the same number of prompt columns") - if not isinstance(seed_tiles_mode, str) and ( - not isinstance(seed_tiles_mode, list) or not all(isinstance(row, list) for row in seed_tiles_mode) - ): - raise ValueError(f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}") - if isinstance(seed_tiles_mode, str): - seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))] for row in prompt] - modes = [mode.value for mode in self.SeedTilesMode] - if any(mode not in modes for row in seed_tiles_mode for mode in row): - raise ValueError(f"Seed tiles mode must be one of {modes}") - if seed_reroll_regions is None: - seed_reroll_regions = [] - batch_size = 1 - - # create original noisy latents using the timesteps - height = tile_height + (grid_rows - 1) * (tile_height - tile_row_overlap) - width = tile_width + (grid_cols - 1) * (tile_width - tile_col_overlap) - latents_shape = (batch_size, self.vae_decoder_num_latent_channels, height // 8, width // 8) - generator = paddle.Generator().manual_seed(seed) - latents = paddle.randn(shape=latents_shape, generator=generator) - - # overwrite latents for specific tiles if provided - if seed_tiles is not None: - for row in range(grid_rows): - for col in range(grid_cols): - seed_tile = seed_tiles[row][col] - if seed_tile is not None: - mode = seed_tiles_mode[row][col] - if mode == self.SeedTilesMode.FULL.value: - row_init, row_end, col_init, col_end = _tile2latent_indices( - row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap - ) - else: - row_init, row_end, col_init, col_end = _tile2latent_exclusive_indices( - row, - col, - tile_width, - tile_height, - tile_row_overlap, - tile_col_overlap, - grid_rows, - grid_cols, - ) - tile_generator = paddle.Generator().manual_seed(seed_tile) - tile_shape = latents_shape[0], latents_shape[1], row_end - row_init, col_end - col_init - latents[:, :, row_init:row_end, col_init:col_end] = paddle.randn( - shape=tile_shape, generator=tile_generator - ) - - # overwrite again for seed reroll regions - for row_init, row_end, col_init, col_end, seed_reroll in seed_reroll_regions: - row_init, row_end, col_init, col_end = _pixel2latent_indices( - row_init, row_end, col_init, col_end - ) # to latent space coordinates - reroll_generator = paddle.Generator().manual_seed(seed_reroll) - region_shape = latents_shape[0], latents_shape[1], row_end - row_init, col_end - col_init - latents[:, :, row_init:row_end, col_init:col_end] = paddle.randn( - shape=region_shape, generator=reroll_generator - ) - - # Prepare scheduler - accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys()) - extra_set_kwargs = {} - if accepts_offset: - extra_set_kwargs["offset"] = 1 - self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) - # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas - if isinstance(self.scheduler, LMSDiscreteScheduler): - latents = latents * self.scheduler.sigmas[0] - - # get prompts text embeddings - text_input = [ - [ - self.tokenizer( - col, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - 
return_tensors="pd", - ) - for col in row - ] - for row in prompt - ] - text_embeddings = [ - [self.text_encoder(input_ids=col.input_ids.astype(np.int64))[0] for col in row] for row in text_input - ] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - for i in range(grid_rows): - for j in range(grid_cols): - max_length = text_input[i][j].input_ids.shape[-1] - uncond_input = self.tokenizer( - [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pd" - ) - uncond_embeddings = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int64))[0] - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - text_embeddings[i][j] = paddle.concat(x=[uncond_embeddings, text_embeddings[i][j]]) - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # Mask for tile weights strenght - tile_weights = self._gaussian_weights(tile_width, tile_height, batch_size) - - # Diffusion timesteps - is_scheduler_support_step_index = self.is_scheduler_support_step_index() - for i, t in tqdm(enumerate(self.scheduler.timesteps)): - t = t.cast("float32") - # Diffuse each tile - noise_preds = [] - for row in range(grid_rows): - noise_preds_row = [] - for col in range(grid_cols): - px_row_init, px_row_end, px_col_init, px_col_end = _tile2latent_indices( - row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap - ) - tile_latents = latents[:, :, px_row_init:px_row_end, px_col_init:px_col_end] - # expand the latents if we are doing classifier free guidance - latent_model_input = ( - paddle.concat(x=[tile_latents] * 2) if do_classifier_free_guidance else tile_latents - ) - if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) - else: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - # predict the noise residual - unet_inputs = dict( - sample=latent_model_input, - timestep=t, - encoder_hidden_states=text_embeddings[row][col], - infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, - ) - noise_pred = self.unet(**unet_inputs)[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) - guidance = ( - guidance_scale - if guidance_scale_tiles is None or guidance_scale_tiles[row][col] is None - else guidance_scale_tiles[row][col] - ) - noise_pred_tile = noise_pred_uncond + guidance * (noise_pred_text - noise_pred_uncond) - noise_preds_row.append(noise_pred_tile) - noise_preds.append(noise_preds_row) - # Stitch noise predictions for all tiles - noise_pred = paddle.zeros(shape=latents.shape) - contributors = paddle.zeros(shape=latents.shape) - 
# Add each tile contribution to overall latents - for row in range(grid_rows): - for col in range(grid_cols): - px_row_init, px_row_end, px_col_init, px_col_end = _tile2latent_indices( - row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap - ) - noise_pred[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += ( - noise_preds[row][col] * tile_weights - ) - contributors[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += tile_weights - # Average overlapping areas with more than 1 contributor - noise_pred /= contributors - # compute the previous noisy sample x_t -> x_t-1 - if is_scheduler_support_step_index: - latents = self.scheduler.step( - noise_pred, t, latents, step_index=i, return_pred_original_sample=False - ).prev_sample - else: - latents = self.scheduler.step(noise_pred, t, latents).prev_sample - if i == len(self.scheduler.timesteps) - 1: - # sync for accuracy it/s measure - paddle.device.cuda.synchronize() - - # scale and decode the image latents with vae - image = self._decode_vae_latents(latents) - image = (image / 2 + 0.5).clip(min=0, max=1) - image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy() - image = self.numpy_to_pil(image) - return {"images": image} - - def _gaussian_weights(self, tile_width, tile_height, nbatches): - """Generates a gaussian mask of weights for tile contributions""" - import numpy as np - from numpy import exp, pi, sqrt - - latent_width = tile_width // 8 - latent_height = tile_height // 8 - var = 0.01 - midpoint = (latent_width - 1) / 2 - x_probs = [ - (exp(-(x - midpoint) * (x - midpoint) / (latent_width * latent_width) / (2 * var)) / sqrt(2 * pi * var)) - for x in range(latent_width) - ] - midpoint = latent_height / 2 - y_probs = [ - (exp(-(y - midpoint) * (y - midpoint) / (latent_height * latent_height) / (2 * var)) / sqrt(2 * pi * var)) - for y in range(latent_height) - ] - weights = np.outer(y_probs, x_probs) - return paddle.tile( - x=paddle.to_tensor(data=weights), - repeat_times=(nbatches, self.vae_decoder_num_latent_channels, 1, 1), - ) diff --git a/ppdiffusers/examples/community/reference_only.py b/ppdiffusers/examples/community/reference_only.py deleted file mode 100644 index 91c48d754e60..000000000000 --- a/ppdiffusers/examples/community/reference_only.py +++ /dev/null @@ -1,1110 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import inspect -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import paddle -import PIL -from packaging import version -from PIL import Image - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ppdiffusers.configuration_utils import FrozenDict -from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel -from ppdiffusers.models.cross_attention import CrossAttention -from ppdiffusers.models.transformer_2d import Transformer2DModelOutput -from ppdiffusers.models.unet_2d_blocks import ( - ResnetBlock2D, - Transformer2DModel, - Upsample2D, -) -from ppdiffusers.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( - StableDiffusionSafetyChecker, -) -from ppdiffusers.schedulers import KarrasDiffusionSchedulers -from ppdiffusers.utils import ( - PIL_INTERPOLATION, - check_min_version, - deprecate, - logging, - randn_tensor, - replace_example_docstring, -) - -check_min_version("0.14.1") - -EPS = 1e-6 - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import paddle - >>> from ppdiffusers import ReferenceOnlyPipeline - >>> from ppdiffusers.utils import load_image - >>> pipe = ReferenceOnlyPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", paddle_dtype=paddle.float16) - >>> image = load_image("dog_rel.png").resize((512, 512)) - >>> prompt = "a dog running on grassland, best quality" - >>> image = pipe(prompt, - ... image=image, - ... width=512, - ... height=512, - ... control_name="refernce_only", # "none", "reference_only", "reference_adain", "reference_adain+attn" - ... attention_auto_machine_weight=1.0, - ... gn_auto_machine_weight=1.0, - ... current_style_fidelity=1.0).images[0] - >>> image.save("refernce_only_dog.png") - ``` -""" - - -def stable_var(x, axis=None, unbiased=True, keepdim=False, name=None): - dtype = x.dtype - u = paddle.mean(x, axis=axis, keepdim=True, name=name) - n = paddle.cast(paddle.numel(x), paddle.int64) / paddle.cast(paddle.numel(u), paddle.int64) - n = n.astype(dtype) - if unbiased: - one_const = paddle.ones([], x.dtype) - n = paddle.where(n > one_const, n - 1.0, one_const) - n = n**0.5 - n.stop_gradient = True - out = paddle.sum(paddle.pow((x - u) / n, 2), axis=axis, keepdim=keepdim, name=name) - return out - - -def var_mean(x, axis=-1, keepdim=True, unbiased=True, correction=None): - if correction is not None: - unbiased = correction - var = stable_var(x, axis=axis, keepdim=keepdim, unbiased=unbiased) - mean = paddle.mean(x, axis=axis, keepdim=keepdim) - return var, mean - - -def self_attn_forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs): - attn_output = None - - if getattr(self, "enable_attn", False): - assert attention_mask is None, "attention_mask must be None!" 
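# --- Editor's note (illustrative, not part of the original file) ---
# var_mean, defined above, mirrors torch.var_mean for Paddle tensors: it returns the
# (variance, mean) pair along the given axes. A tiny sanity check under these
# assumptions:
#     x = paddle.ones([2, 8, 4, 4])
#     var, mean = var_mean(x, axis=(2, 3), keepdim=True, unbiased=False)
#     # var.shape == mean.shape == [2, 8, 1, 1]; var is all zeros for a constant input
# --------------------------------------------------------------------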
- if self.attention_auto_machine_weight > self.attn_weight: - do_classifier_free_guidance = len(self.current_uc_indices) > 0 - chunk_num = 2 if do_classifier_free_guidance else 1 - latent_hidden_states = hidden_states[:chunk_num] # uc, c - image_hidden_states = hidden_states[chunk_num:] # uc, c - - image_self_attn1 = self.processor( - self, - hidden_states=image_hidden_states, - encoder_hidden_states=image_hidden_states, - attention_mask=attention_mask, - **cross_attention_kwargs, - ) - - latent_self_attn1_uc = self.processor( - self, - latent_hidden_states, - encoder_hidden_states=paddle.concat( - [latent_hidden_states] - + image_hidden_states.split([chunk_num] * (image_hidden_states.shape[0] // chunk_num)), - axis=1, - ), - attention_mask=attention_mask, - **cross_attention_kwargs, - ) - - if do_classifier_free_guidance and self.current_style_fidelity > 1e-5: - latent_self_attn1_c = latent_self_attn1_uc.clone() - latent_self_attn1_c[self.current_uc_indices] = self.processor( - self, - hidden_states=latent_hidden_states[self.current_uc_indices], - encoder_hidden_states=latent_hidden_states[self.current_uc_indices], - attention_mask=attention_mask, - **cross_attention_kwargs, - ) - latent_self_attn1 = ( - self.current_style_fidelity * latent_self_attn1_c - + (1.0 - self.current_style_fidelity) * latent_self_attn1_uc - ) - else: - latent_self_attn1 = latent_self_attn1_uc - - attn_output = paddle.concat([latent_self_attn1, image_self_attn1]) - - if attn_output is None: - attn_output = self.processor( - self, - hidden_states=hidden_states, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - **cross_attention_kwargs, - ) - return attn_output - - -def transformer_2d_model_forward( - self, - hidden_states, - encoder_hidden_states=None, - timestep=None, - class_labels=None, - cross_attention_kwargs=None, - return_dict: bool = True, -): - x = self.original_forward( - hidden_states, - encoder_hidden_states=encoder_hidden_states, - timestep=timestep, - class_labels=class_labels, - cross_attention_kwargs=cross_attention_kwargs, - return_dict=return_dict, - )[0] - output = None - if getattr(self, "enable_gn", False): - if self.gn_auto_machine_weight > self.gn_weight: - do_classifier_free_guidance = len(self.current_uc_indices) > 0 - chunk_num = 2 if do_classifier_free_guidance else 1 - - latent_hidden_states = x[:chunk_num] # uc, c - image_hidden_states = x[chunk_num:] # uc, c - image_var, image_mean = var_mean(image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) - var, mean = var_mean(latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) - std = paddle.maximum(var, paddle.zeros_like(var) + EPS) ** 0.5 - - div_num = image_hidden_states.shape[0] // chunk_num - mean_acc = sum(image_mean.split([chunk_num] * div_num)) / div_num - var_acc = sum(image_var.split([chunk_num] * div_num)) / div_num - - std_acc = paddle.maximum(var_acc, paddle.zeros_like(var_acc) + EPS) ** 0.5 - y_uc = (((latent_hidden_states - mean) / std) * std_acc) + mean_acc - if do_classifier_free_guidance and self.current_style_fidelity > 1e-5: - y_c = y_uc.clone() - y_c[self.current_uc_indices] = latent_hidden_states[self.current_uc_indices] - latent_hidden_states = self.current_style_fidelity * y_c + (1.0 - self.current_style_fidelity) * y_uc - else: - latent_hidden_states = y_uc - output = paddle.concat([latent_hidden_states, image_hidden_states]) - - if output is None: - output = x - if not return_dict: - return (output,) - - return Transformer2DModelOutput(sample=output) - - -def 
resnet_block_2d_forward(self, input_tensor, temb): - x = self.original_forward(input_tensor, temb=temb) - output = None - if getattr(self, "enable_gn", False): - if self.gn_auto_machine_weight > self.gn_weight: - do_classifier_free_guidance = len(self.current_uc_indices) > 0 - chunk_num = 2 if do_classifier_free_guidance else 1 - - latent_hidden_states = x[:chunk_num] # uc, c - image_hidden_states = x[chunk_num:] # uc, c - image_var, image_mean = var_mean(image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) - var, mean = var_mean(latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) - std = paddle.maximum(var, paddle.zeros_like(var) + EPS) ** 0.5 - - div_num = image_hidden_states.shape[0] // chunk_num - mean_acc = sum(image_mean.split([chunk_num] * div_num)) / div_num - var_acc = sum(image_var.split([chunk_num] * div_num)) / div_num - - std_acc = paddle.maximum(var_acc, paddle.zeros_like(var_acc) + EPS) ** 0.5 - y_uc = (((latent_hidden_states - mean) / std) * std_acc) + mean_acc - if do_classifier_free_guidance and self.current_style_fidelity > 1e-5: - y_c = y_uc.clone() - y_c[self.current_uc_indices] = latent_hidden_states[self.current_uc_indices] - latent_hidden_states = self.current_style_fidelity * y_c + (1.0 - self.current_style_fidelity) * y_uc - else: - latent_hidden_states = y_uc - output = paddle.concat([latent_hidden_states, image_hidden_states]) - - if output is None: - output = x - - return output - - -def upsample_2d_forward(self, hidden_states, output_size=None): - x = self.original_forward(hidden_states, output_size=output_size) - output = None - if getattr(self, "enable_gn", False): - if self.gn_auto_machine_weight > self.gn_weight: - do_classifier_free_guidance = len(self.current_uc_indices) > 0 - chunk_num = 2 if do_classifier_free_guidance else 1 - - latent_hidden_states = x[:chunk_num] # uc, c - image_hidden_states = x[chunk_num:] # uc, c - image_var, image_mean = var_mean(image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) - var, mean = var_mean(latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) - std = paddle.maximum(var, paddle.zeros_like(var) + EPS) ** 0.5 - - div_num = image_hidden_states.shape[0] // chunk_num - mean_acc = sum(image_mean.split([chunk_num] * div_num)) / div_num - var_acc = sum(image_var.split([chunk_num] * div_num)) / div_num - - std_acc = paddle.maximum(var_acc, paddle.zeros_like(var_acc) + EPS) ** 0.5 - y_uc = (((latent_hidden_states - mean) / std) * std_acc) + mean_acc - if do_classifier_free_guidance and self.current_style_fidelity > 1e-5: - y_c = y_uc.clone() - y_c[self.current_uc_indices] = latent_hidden_states[self.current_uc_indices] - latent_hidden_states = self.current_style_fidelity * y_c + (1.0 - self.current_style_fidelity) * y_uc - else: - latent_hidden_states = y_uc - output = paddle.concat([latent_hidden_states, image_hidden_states]) - - if output is None: - output = x - - return output - - -try: - # in ppdiffusers 0.16.1, we need patch `Attention` - from ppdiffusers.models.attention_processor import Attention - - if not hasattr(Attention, "original_forward"): - Attention.original_forward = Attention.forward - Attention.forward = self_attn_forward -except ImportError: - pass -if not hasattr(CrossAttention, "original_forward"): - CrossAttention.original_forward = CrossAttention.forward -if not hasattr(Transformer2DModel, "original_forward"): - Transformer2DModel.original_forward = Transformer2DModel.forward -if not hasattr(ResnetBlock2D, "original_forward"): - 
ResnetBlock2D.original_forward = ResnetBlock2D.forward -if not hasattr(Upsample2D, "original_forward"): - Upsample2D.original_forward = Upsample2D.forward -CrossAttention.forward = self_attn_forward -Transformer2DModel.forward = transformer_2d_model_forward -ResnetBlock2D.forward = resnet_block_2d_forward -Upsample2D.forward = upsample_2d_forward - - -def preprocess(image, resize_mode, width, height): - if isinstance(image, paddle.Tensor): - return image - elif isinstance(image, PIL.Image.Image): - image = resize_image(resize_mode=resize_mode, im=image, width=width, height=height) - image = [image] - - if isinstance(image[0], PIL.Image.Image): - image = [resize_image(resize_mode=resize_mode, im=im, width=width, height=height) for im in image] - - w, h = image[0].size - w, h = map(lambda x: x - x % 8, (w, h)) # resize to integer multiple of 8 - - image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] - image = np.concatenate(image, axis=0) - image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = 2.0 * image - 1.0 - image = paddle.to_tensor(image) - elif isinstance(image[0], paddle.Tensor): - image = paddle.concat(image, axis=0) - return image - - -def resize_image(resize_mode, im, width, height, upscaler_name=None): - """ - Resizes an image with the specified resize_mode, width, and height. - - Args: - resize_mode: The mode to use when resizing the image. - -1: do nothing. - 0: Resize the image to the specified width and height. - 1: Resize the image to fill the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, cropping the excess. - 2: Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, filling empty with data from image. - im: The image to resize. - width: The width to resize the image to. - height: The height to resize the image to. - upscaler_name: The name of the upscaler to use. If not provided, defaults to opts.upscaler_for_img2img. 
- """ - # ["Just resize", "Crop and resize", "Resize and fill", "Do nothing"] - # 0 1 2 -1 - def resize(im, w, h): - if upscaler_name is None or upscaler_name == "None" or im.mode == "L": - return im.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) - - if resize_mode == -1: - return im - elif resize_mode == 0: - res = resize(im, width, height) - - elif resize_mode == 1: - ratio = width / height - src_ratio = im.width / im.height - - src_w = width if ratio > src_ratio else im.width * height // im.height - src_h = height if ratio <= src_ratio else im.height * width // im.width - - resized = resize(im, src_w, src_h) - res = Image.new("RGB", (width, height)) - res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) - - else: - ratio = width / height - src_ratio = im.width / im.height - - src_w = width if ratio < src_ratio else im.width * height // im.height - src_h = height if ratio >= src_ratio else im.height * width // im.width - - resized = resize(im, src_w, src_h) - res = Image.new("RGB", (width, height)) - res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) - - if ratio < src_ratio: - fill_height = height // 2 - src_h // 2 - res.paste(resized.resize((width, fill_height), box=(0, 0, width, 0)), box=(0, 0)) - res.paste( - resized.resize((width, fill_height), box=(0, resized.height, width, resized.height)), - box=(0, fill_height + src_h), - ) - elif ratio > src_ratio: - fill_width = width // 2 - src_w // 2 - res.paste(resized.resize((fill_width, height), box=(0, 0, 0, height)), box=(0, 0)) - res.paste( - resized.resize((fill_width, height), box=(resized.width, 0, resized.width, height)), - box=(fill_width + src_w, 0), - ) - - return res - - -class ReferenceOnlyPipeline(DiffusionPipeline): - r""" - Pipeline for text-to-image generation using Stable Diffusion with refernce only. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`PNDMScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] - or [`DPMSolverMultistepScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
- """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = True, - ): - super().__init__() - - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " - "to update the config accordingly as leaving `steps_offset` might led to incorrect results" - " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," - " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file" - ) - deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." - " `clip_sample` should be set to False in the configuration file. Please make sure to update the" - " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" - " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" - " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" - ) - deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["clip_sample"] = False - scheduler._internal_dict = FrozenDict(new_config) - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version).base_version - ) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 - if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: - deprecation_message = ( - "The configuration file of the unet has set the default `sample_size` to smaller than" - " 64 which seems highly unlikely. 
If your checkpoint is a fine-tuned version of any of the" - " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" - " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" - " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" - " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" - " in the config might lead to incorrect results in future versions. If you have downloaded this" - " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file" - ) - deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(unet.config) - new_config["sample_size"] = 64 - unet._internal_dict = FrozenDict(new_config) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - self.attn_modules = None - self.gn_modules = None - - def set_reference_only( - self, - attention_auto_machine_weight=1.0, - gn_auto_machine_weight=1.0, - current_style_fidelity=0.5, - enable_attn=True, - enable_gn=True, - do_classifier_free_guidance=True, - ): - assert 0.0 <= attention_auto_machine_weight <= 1.0 - assert 0.0 <= gn_auto_machine_weight <= 2.0 - assert 0.0 <= current_style_fidelity <= 1.0 - - if self.attn_modules is not None: - for module in self.attn_modules: - module.enable_attn = enable_attn - module.attention_auto_machine_weight = attention_auto_machine_weight - module.current_style_fidelity = current_style_fidelity - module.current_uc_indices = [0] if do_classifier_free_guidance else [] - - if self.gn_modules is not None: - for module in self.gn_modules: - module.enable_gn = enable_gn - module.gn_auto_machine_weight = gn_auto_machine_weight - module.current_style_fidelity = current_style_fidelity - module.current_uc_indices = [0] if do_classifier_free_guidance else [] - - # init attn_modules - if self.attn_modules is None: - attn_modules = [] - self_attn_processors_keys = [] - for name in self.unet.attn_processors.keys(): - if not name.endswith("attn1.processor"): - continue - name = name.replace(".processor", "") - if name.startswith("mid_block"): - hidden_size = self.unet.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(self.unet.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = self.unet.config.block_out_channels[block_id] - self_attn_processors_keys.append([name, hidden_size]) - - # sorted by (-hidden_size, name),down -> mid -> up. 
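# Editor's note (not part of the removed file): after the sort above, each
# self-attention layer receives a rank-based weight attn_weight = i / N in [0, 1).
# The patched self_attn_forward defined earlier only takes the reference-only
# branch when attention_auto_machine_weight > attn_weight, so a value of 1.0
# enables the reference path for every layer, while smaller values restrict it
# to the earliest-ranked layers (those with the largest hidden_size after the
# (-hidden_size, name) sort).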
- for i, (name, _) in enumerate(sorted(self_attn_processors_keys, key=lambda x: (-x[1], x[0]))): - module = self.unet.get_sublayer(name) - module.attn_weight = float(i) / float(len(self_attn_processors_keys)) - - module.enable_attn = enable_attn - module.attention_auto_machine_weight = attention_auto_machine_weight - module.current_style_fidelity = current_style_fidelity - module.current_uc_indices = [0] if do_classifier_free_guidance else [] - - attn_modules.append(module) - self.attn_modules = attn_modules - - # init gn_modules - if self.gn_modules is None: - gn_modules = [ - self.unet.mid_block.attentions[-1], - ] - self.unet.mid_block.attentions[-1].gn_weight = 0.0 # mid 0.0 - - input_block_names = [ - ("down_blocks.1.resnets.0", "down_blocks.1.attentions.0"), # 4 2.0 - ("down_blocks.1.resnets.1", "down_blocks.1.attentions.1"), # 5 1.66 - ("down_blocks.2.resnets.0", "down_blocks.2.attentions.0"), # 7 1.33 - ("down_blocks.2.resnets.1", "down_blocks.2.attentions.1"), # 8 1.0 - ("down_blocks.3.resnets.0",), # 10 0.66 - ("down_blocks.3.resnets.1",), # 11 0.33 - ] - for w, block_names in enumerate(input_block_names): - module = self.unet.get_sublayer(block_names[-1]) - module.gn_weight = 1.0 - float(w) / float(len(input_block_names)) - gn_modules.append(module) - - output_block_names = [ - ("up_blocks.0.resnets.0",), # 0 0.0 - ("up_blocks.0.resnets.1",), # 1 0.25 - ("up_blocks.0.resnets.2", "up_blocks.0.upsamplers.0"), # 2 0.5 - ("up_blocks.1.resnets.0", "up_blocks.1.attentions.0"), # 3 0.75 - ("up_blocks.1.resnets.1", "up_blocks.1.attentions.1"), # 4 1.0 - ("up_blocks.1.resnets.2", "up_blocks.1.attentions.2"), # 5 1.25 - ("up_blocks.2.resnets.0", "up_blocks.2.attentions.0"), # 6 1.5 - ("up_blocks.2.resnets.1", "up_blocks.2.attentions.1"), # 7 1.75 - ] - for w, block_names in enumerate(output_block_names): - module = self.unet.get_sublayer(block_names[-1]) - module.gn_weight = float(w) / float(len(output_block_names)) - gn_modules.append(module) - - for module in gn_modules: - module.gn_weight *= 2 - module.enable_gn = enable_gn - module.gn_auto_machine_weight = gn_auto_machine_weight - module.current_style_fidelity = current_style_fidelity - module.current_uc_indices = [0] if do_classifier_free_guidance else [] - - self.gn_modules = gn_modules - - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. 
Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = [batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor] - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - def prepare_image_latents(self, image, batch_size, dtype, generator=None, do_classifier_free_guidance=False): - if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)): - raise ValueError( - f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or list but is {type(image)}" - ) - image = image.cast(dtype) - - if isinstance(generator, list): - init_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) - ] - init_latents = paddle.concat(init_latents, axis=0) - else: - init_latents = self.vae.encode(image).latent_dist.sample(generator) - - init_latents = self.vae.config.scaling_factor * init_latents - - if do_classifier_free_guidance: - init_latents = paddle.concat([init_latents] * 2) - - return init_latents - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - # reference - 
control_name: str = "reference_only", # "none", "reference_only", "reference_adain", "reference_adain+attn" - attention_auto_machine_weight: float = 1.0, - gn_auto_machine_weight: float = 1.0, - current_style_fidelity: float = 0.5, - resize_mode: int = -1, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `paddle.Tensor`): - The image or images to guide the image generation. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
- return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under - `self.processor` in `ppdiffusers.models.cross_attention`. - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - assert control_name in ["none", "reference_only", "reference_adain", "reference_adain+attn"] - assert num_images_per_prompt == 1 - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - dtype = prompt_embeds.dtype - - # 5. Prepare latent variables - num_channels_latents = self.unet.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. 
reference_only - enable_attn = ( - "only" in control_name - or "attn" in control_name - and image is not None - and attention_auto_machine_weight > 0 - ) - enable_gn = "adain" in control_name and image is not None and gn_auto_machine_weight > 0 - self.set_reference_only( - attention_auto_machine_weight, - gn_auto_machine_weight, - current_style_fidelity, - enable_attn, - enable_gn, - do_classifier_free_guidance, - ) - - if enable_attn or enable_gn: - image = preprocess(image, resize_mode, width, height) - image_latents = self.prepare_image_latents( - image, batch_size, dtype, generator, do_classifier_free_guidance - ) - prompt_embeds = prompt_embeds.tile([1 + image.shape[0], 1, 1]) - - # 8. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - if enable_attn or enable_gn: - image_noise = randn_tensor(image_latents.shape, generator=generator, dtype=dtype) - image_latent_model_input = self.scheduler.scale_model_input( - self.scheduler.add_noise(image_latents, image_noise, t), t - ) - chunk_num = 2 if do_classifier_free_guidance else 1 - noise_pred = self.unet( - paddle.concat([latent_model_input, image_latent_model_input.cast(latent_model_input.dtype)]), - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample[:chunk_num] - else: - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - image = latents - has_nsfw_concept = None - elif output_type == "pil": - # 9. Post-processing - image = self.decode_latents(latents) - - # 10. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - - # 11. Convert to PIL - image = self.numpy_to_pil(image) - else: - # 9. Post-processing - image = self.decode_latents(latents) - - # 10. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/stable_diffusion_controlnet_img2img.py b/ppdiffusers/examples/community/stable_diffusion_controlnet_img2img.py deleted file mode 100644 index 80d62cf8a852..000000000000 --- a/ppdiffusers/examples/community/stable_diffusion_controlnet_img2img.py +++ /dev/null @@ -1,912 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import inspect -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import numpy as np -import paddle -import PIL.Image - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ppdiffusers.image_processor import VaeImageProcessor -from ppdiffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin -from ppdiffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel -from ppdiffusers.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import ( - MultiControlNetModel, -) -from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( - StableDiffusionSafetyChecker, -) -from ppdiffusers.schedulers import KarrasDiffusionSchedulers -from ppdiffusers.utils import ( - check_min_version, - deprecate, - logging, - randn_tensor, - replace_example_docstring, -) - -check_min_version("0.16.1") - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> # !pip install opencv-python paddlenlp - >>> from ppdiffusers import StableDiffusionControlNetImg2ImgPipeline, ControlNetModel, UniPCMultistepScheduler - >>> from ppdiffusers.utils import load_image - >>> import numpy as np - >>> import paddle - - >>> import cv2 - >>> from PIL import Image - - >>> # download an image - >>> image = load_image( - ... "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" - ... ) - >>> np_image = np.array(image) - - >>> # get canny image - >>> np_image = cv2.Canny(np_image, 100, 200) - >>> np_image = np_image[:, :, None] - >>> np_image = np.concatenate([np_image, np_image, np_image], axis=2) - >>> canny_image = Image.fromarray(np_image) - - >>> # load control net and stable diffusion v1-5 - >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", paddle_dtype=paddle.float16) - >>> pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained( - ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, paddle_dtype=paddle.float16 - ... ) - - >>> # speed up diffusion process with faster scheduler and memory optimization - >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) - - >>> # generate image - >>> generator = paddle.Generaotr().manual_seed(0) - >>> image = pipe( - ... "futuristic-looking woman", - ... num_inference_steps=20, - ... generator=generator, - ... image=image, - ... control_image=canny_image, - ... ).images[0] - >>> image.save("demo.png") - ``` -""" - - -class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): - r""" - Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance. - - This model inherits from [`DiffusionPipeline`]. 
Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - In addition the pipeline inherits the following loading methods: - - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - controlnet ([`ControlNetModel`] or `List[ControlNetModel]`): - Provides additional conditioning to the unet during the denoising process. If you set multiple ControlNets - as a list, the outputs from each ControlNet are added together to create one combined additional - conditioning. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel], - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = True, - ): - super().__init__() - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. Both the diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
- ) - - if isinstance(controlnet, (list, tuple)): - controlnet = MultiControlNetModel(controlnet) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - controlnet=controlnet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) - self.control_image_processor = VaeImageProcessor( - vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False - ) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - lora_scale: Optional[float] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - lora_scale (`float`, *optional*): - A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
- """ - # set lora scale so that monkey patched LoRA - # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): - self._lora_scale = lora_scale - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - config = ( - self.text_encoder.config - if isinstance(self.text_encoder.config, dict) - else self.text_encoder.config.to_dict() - ) - if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - config = ( - self.text_encoder.config - if isinstance(self.text_encoder.config, dict) - else self.text_encoder.config.to_dict() - ) - if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - def run_safety_checker(self, image, dtype): - if self.safety_checker is None: - has_nsfw_concept = None - else: - if paddle.is_tensor(image): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - else: - feature_extractor_input = self.image_processor.numpy_to_pil(image) - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - return image, has_nsfw_concept - - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - image, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - controlnet_conditioning_scale=1.0, - ): - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." 
- ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - # `prompt` needs more sophisticated handling when there are multiple - # conditionings. - if isinstance(self.controlnet, MultiControlNetModel): - if isinstance(prompt, list): - logger.warning( - f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}" - " prompts. The conditionings will be fixed across the prompts." - ) - - if isinstance(self.controlnet, ControlNetModel): - self.check_image(image, prompt, prompt_embeds) - elif isinstance(self.controlnet, MultiControlNetModel): - if not isinstance(image, list): - raise TypeError("For multiple controlnets: `image` must be type `list`") - - # When `image` is a nested list: - # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) - elif any(isinstance(i, list) for i in image): - raise ValueError("A single batch of multiple conditionings are supported at the moment.") - elif len(image) != len(self.controlnet.nets): - raise ValueError( - f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." 
- ) - - for image_ in image: - self.check_image(image_, prompt, prompt_embeds) - else: - assert False - - # Check `controlnet_conditioning_scale` - if isinstance(self.controlnet, ControlNetModel): - if not isinstance(controlnet_conditioning_scale, float): - raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") - elif isinstance(self.controlnet, MultiControlNetModel): - if isinstance(controlnet_conditioning_scale, list): - if any(isinstance(i, list) for i in controlnet_conditioning_scale): - raise ValueError("A single batch of multiple conditionings are supported at the moment.") - elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( - self.controlnet.nets - ): - raise ValueError( - "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" - " the same length as the number of controlnets" - ) - else: - assert False - - def check_image(self, image, prompt, prompt_embeds): - image_is_pil = isinstance(image, PIL.Image.Image) - image_is_tensor = isinstance(image, paddle.Tensor) - image_is_np = isinstance(image, np.ndarray) - image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) - image_is_tensor_list = isinstance(image, list) and isinstance(image[0], paddle.Tensor) - image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) - - if ( - not image_is_pil - and not image_is_tensor - and not image_is_np - and not image_is_pil_list - and not image_is_tensor_list - and not image_is_np_list - ): - raise TypeError( - f"image must be passed and be one of PIL image, numpy array, paddle tensor, list of PIL images, list of numpy arrays or list of paddle tensors, but is {type(image)}" - ) - - if image_is_pil: - image_batch_size = 1 - else: - image_batch_size = len(image) - - if prompt is not None and isinstance(prompt, str): - prompt_batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - prompt_batch_size = len(prompt) - elif prompt_embeds is not None: - prompt_batch_size = prompt_embeds.shape[0] - - if image_batch_size != 1 and image_batch_size != prompt_batch_size: - raise ValueError( - f"If image batch size is not 1, image batch size must be same as prompt batch size. 
image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" - ) - - def prepare_control_image( - self, - image, - width, - height, - batch_size, - num_images_per_prompt, - dtype, - do_classifier_free_guidance=False, - guess_mode=False, - ): - image = self.control_image_processor.preprocess(image, height=height, width=width).cast(dtype=paddle.float32) - image_batch_size = image.shape[0] - - if image_batch_size == 1: - repeat_by = batch_size - else: - # image batch size is the same as prompt batch size - repeat_by = num_images_per_prompt - - image = image.repeat_interleave(repeat_by, axis=0) - - image = image.cast(dtype) - - if do_classifier_free_guidance and not guess_mode: - image = paddle.concat([image] * 2) - - return image - - def get_timesteps(self, num_inference_steps, strength): - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] - - return timesteps, num_inference_steps - t_start - - def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): - if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)): - raise ValueError( - f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" - ) - - image = image.cast(dtype=dtype) - - batch_size = batch_size * num_images_per_prompt - - if image.shape[1] == 4: - init_latents = image - - else: - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - elif isinstance(generator, list): - init_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) - ] - init_latents = paddle.concat(init_latents, axis=0) - else: - init_latents = self.vae.encode(image).latent_dist.sample(generator) - - init_latents = self.vae.config.scaling_factor * init_latents - - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: - # expand init_latents for batch_size - deprecation_message = ( - f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" - " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" - " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many initial images as text prompts to suppress this warning." - ) - deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) - additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = paddle.concat([init_latents] * additional_image_per_prompt, axis=0) - elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." 
- ) - else: - init_latents = paddle.concat([init_latents], axis=0) - - shape = init_latents.shape - noise = randn_tensor(shape, generator=generator, dtype=dtype) - - # get latents - init_latents = self.scheduler.add_noise(init_latents, noise, timestep) - latents = init_latents - - return latents - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: Union[ - paddle.Tensor, - PIL.Image.Image, - np.ndarray, - List[paddle.Tensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, - control_image: Union[ - paddle.Tensor, - PIL.Image.Image, - np.ndarray, - List[paddle.Tensor], - List[PIL.Image.Image], - List[np.ndarray], - ] = None, - height: Optional[int] = None, - width: Optional[int] = None, - strength: float = 0.8, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - controlnet_conditioning_scale: Union[float, List[float]] = 0.8, - guess_mode: bool = False, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[paddle.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[paddle.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): - The initial image will be used as the starting point for the image generation process. Can also accpet - image latents as `image`, if passing latents directly, it will not be encoded again. - control_image (`paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[paddle.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[paddle.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): - The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `paddle.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can - also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If - height and/or width are passed, `image` is resized according to them. If multiple ControlNets are - specified in init, images must be passed as a list such that each element of the list can be correctly - batched for input to a single controlnet. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. 
- guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will be generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 0.8): - The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added - to the residual in the original unet. If multiple ControlNets are specified in init, you can set the - corresponding scale as a list. 
Note that by default, we use a smaller conditioning scale for inpainting - than for [`~StableDiffusionControlNetPipeline.__call__`]. - guess_mode (`bool`, *optional*, defaults to `False`): - In this mode, the ControlNet encoder will try best to recognize the content of the input image even if - you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended. - - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - control_image, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - controlnet_conditioning_scale, - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - controlnet = self.controlnet - - if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): - controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) - - global_pool_conditions = ( - controlnet.config.global_pool_conditions - if isinstance(controlnet, ControlNetModel) - else controlnet.nets[0].config.global_pool_conditions - ) - guess_mode = guess_mode or global_pool_conditions - - # 3. Encode input prompt - text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None - ) - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=text_encoder_lora_scale, - ) - # 4. Prepare image - image = self.image_processor.preprocess(image).cast(dtype=paddle.float32) - - # 5. Prepare controlnet_conditioning_image - if isinstance(controlnet, ControlNetModel): - control_image = self.prepare_control_image( - image=control_image, - width=width, - height=height, - batch_size=batch_size * num_images_per_prompt, - num_images_per_prompt=num_images_per_prompt, - dtype=controlnet.dtype, - do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, - ) - elif isinstance(controlnet, MultiControlNetModel): - control_images = [] - - for control_image_ in control_image: - control_image_ = self.prepare_control_image( - image=control_image_, - width=width, - height=height, - batch_size=batch_size * num_images_per_prompt, - num_images_per_prompt=num_images_per_prompt, - dtype=controlnet.dtype, - do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, - ) - - control_images.append(control_image_) - - control_image = control_images - else: - assert False - - # 5. 
Prepare timesteps - self.scheduler.set_timesteps( - num_inference_steps, - ) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) - - # 6. Prepare latent variables - latents = self.prepare_latents( - image, - latent_timestep, - batch_size, - num_images_per_prompt, - prompt_embeds.dtype, - generator, - ) - - # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 8. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # controlnet(s) inference - if guess_mode and do_classifier_free_guidance: - # Infer ControlNet only for the conditional batch. - control_model_input = latents - control_model_input = self.scheduler.scale_model_input(control_model_input, t) - controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] - else: - control_model_input = latent_model_input - controlnet_prompt_embeds = prompt_embeds - - down_block_res_samples, mid_block_res_sample = self.controlnet( - control_model_input, - t, - encoder_hidden_states=controlnet_prompt_embeds, - controlnet_cond=control_image, - conditioning_scale=controlnet_conditioning_scale, - guess_mode=guess_mode, - return_dict=False, - ) - - if guess_mode and do_classifier_free_guidance: - # Infered ControlNet only for the conditional batch. - # To apply the output of ControlNet to both the unconditional and conditional batches, - # add 0 to the unconditional batch to keep it unchanged. 
- down_block_res_samples = [paddle.concat([paddle.zeros_like(d), d]) for d in down_block_res_samples] - mid_block_res_sample = paddle.concat( - [paddle.zeros_like(mid_block_res_sample), mid_block_res_sample] - ) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - down_block_additional_residuals=down_block_res_samples, - mid_block_additional_residual=mid_block_res_sample, - return_dict=False, - )[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if not output_type == "latent": - image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/stable_diffusion_hires_fix.py b/ppdiffusers/examples/community/stable_diffusion_hires_fix.py deleted file mode 100644 index b086e03b1160..000000000000 --- a/ppdiffusers/examples/community/stable_diffusion_hires_fix.py +++ /dev/null @@ -1,768 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import inspect -from typing import Any, Callable, Dict, List, Optional, Union - -import paddle -from packaging import version - -from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer -from ppdiffusers import AutoencoderKL, DiffusionPipeline, UNet2DConditionModel -from ppdiffusers.configuration_utils import FrozenDict -from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( - StableDiffusionSafetyChecker, -) -from ppdiffusers.schedulers import KarrasDiffusionSchedulers -from ppdiffusers.utils import ( - deprecate, - logging, - randn_tensor, - replace_example_docstring, -) - -logger = logging.get_logger(__name__) - -EXAMPLE_DOC_STRING = """ - Examples: - ```python - >>> import paddle - >>> from ppdiffusers import StableDiffusionHiresFixPipeline - >>> paddle.seed(5232132133) - - >>> pipe = StableDiffusionHiresFixPipeline.from_pretrained("stabilityai/stable-diffusion-2", paddle_dtype=paddle.float16) - >>> prompt = "1 real girl, long black hair, detailed face, light smile, chinese style, hanfu" - >>> image = pipe(prompt, guidance_scale=7.5, height=768, width=768, hr_resize_width=1024, hr_resize_height=1024).images[0] - >>> image.save("girl.png") - ``` -""" - - -class StableDiffusionHiresFixPipeline(DiffusionPipeline): - r""" - Pipeline for text-to-image generation with high resolution fixing(hires.fix) based on Stable Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`PNDMScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] - or [`DPMSolverMultistepScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPFeatureExtractor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
- """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - requires_safety_checker: bool = True, - ): - super().__init__() - - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " - "to update the config accordingly as leaving `steps_offset` might led to incorrect results" - " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," - " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file" - ) - deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." - " `clip_sample` should be set to False in the configuration file. Please make sure to update the" - " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" - " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" - " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" - ) - deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["clip_sample"] = False - scheduler._internal_dict = FrozenDict(new_config) - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version).base_version - ) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 - if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: - deprecation_message = ( - "The configuration file of the unet has set the default `sample_size` to smaller than" - " 64 which seems highly unlikely. 
If your checkpoint is a fine-tuned version of any of the" - " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" - " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" - " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" - " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" - " in the config might lead to incorrect results in future versions. If you have downloaded this" - " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file" - ) - deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(unet.config) - new_config["sample_size"] = 64 - unet._internal_dict = FrozenDict(new_config) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. 
- """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. 
- # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - def get_timesteps(self, denoising_steps, denoising_strength): - steps = int(denoising_steps / min(denoising_strength, 0.999)) - self.scheduler.set_timesteps(steps) - - t_start = max(steps - denoising_steps, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] - - if hasattr(self.scheduler, "step_index_offset"): - self.scheduler.step_index_offset = t_start * self.scheduler.order - - return timesteps, denoising_steps - - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - hr_scale, - hr_resize_height, - hr_resize_width, - denoising_strength, - latent_scale_mode, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
- ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if hr_scale < 0: - raise ValueError(f"hr_scale should be greater than or equal to 0, but received {hr_scale}") - - if hr_resize_height % 8 != 0 or hr_resize_width % 8 != 0: - raise ValueError( - f"`hr_resize_height` and `hr_resize_width` have to be divisible by 8 but are {hr_resize_height} and {hr_resize_width}." - ) - - if denoising_strength > 1 or denoising_strength < 0: - raise ValueError(f"denoising_strength should be set between 0 and 1, but received {denoising_strength}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if latent_scale_mode not in ["nearest", "bilinear", "bicubic", "area"]: - raise ValueError( - f"latent_scale_mode must be one of [nearest, bilinear, bicubic, area], but received {latent_scale_mode}." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = [batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor] - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - def get_upscaled_width_and_height(self, width, height, hr_scale=2, hr_resize_width=0, hr_resize_height=0): - if hr_resize_width == 0 and hr_resize_height == 0: - hr_upscale_to_width = int(width * hr_scale) - hr_upscale_to_height = int(height * hr_scale) - else: - if hr_resize_height == 0: - hr_upscale_to_width = hr_resize_width - hr_upscale_to_height = hr_resize_width * height // width - elif hr_resize_width == 0: - hr_upscale_to_width = hr_resize_height * width // height - hr_upscale_to_height = hr_resize_height - else: - src_ratio = width / height - dst_ratio = hr_resize_width / hr_resize_height - - if src_ratio < dst_ratio: - hr_upscale_to_width = hr_resize_width - hr_upscale_to_height = hr_resize_width * height // width - else: - hr_upscale_to_width = hr_resize_height * width // height - hr_upscale_to_height = hr_resize_height - - return hr_upscale_to_width, hr_upscale_to_height - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 40, - hires_ratio: Optional[float] = 0.5, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - enable_hr: Optional[bool] = True, - hr_scale: Optional[float] = 2.0, - hr_resize_width: Optional[int] = 0, - hr_resize_height: Optional[int] = 0, - denoising_strength: Optional[float] = 0.7, - latent_scale_mode: Optional[str] = "nearest", - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 40): - The number of denoising steps, equal to sample_steps and hr_steps. samples_steps means the initial - denoising steps, and hr_steps means hires denoising steps. More denoising steps usually lead to a - higher quality image at the expense of slower inference. - hires_ratio (`float`, *optional*, defaults to 0.5): - The step proportion of hires.fix, that means hr_steps = int(num_inference_steps * hires_ratio). - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). 
Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will be generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - hr_steps (`int`, *optional*): - The number of denoising steps of the second, high resolution fixing pass, i.e. `int(num_inference_steps * hires_ratio)`. - hr_scale (`float`, *optional*, defaults to 2.0): - The upscale factor applied to the width and height of the image. If set to 2.0, the width and height of the image are expanded to width*2.0 and height*2.0. - hr_resize_width (`int`, *optional*, defaults to 0): - Enables users to specify the upscaled width manually; if hr_resize_width != 0, it is used to compute the scaled width and height instead of hr_scale. - hr_resize_height (`int`, *optional*, defaults to 0): - Enables users to specify the upscaled height manually; 
if hr_resize_height!=0, program will use it to compute scaled width and height instead of hr_scale. - denoising_strength (`float`, *optional*, defaults to 0.7): - The denoising strength applying on hires.fix steps. It take a value between 0 and 1. - latent_scale_mode (`str`, *optional*, defaults to nearest): - The interpolate method applying upscale initial images, you can set it in [nearest, bilinear, bicubic, area]. - - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - height, - width, - callback_steps, - hr_scale, - hr_resize_height, - hr_resize_width, - denoising_strength, - latent_scale_mode, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Prepare timesteps - if enable_hr: - hr_steps = int(num_inference_steps * hires_ratio) - sample_steps = num_inference_steps - hr_steps - else: - hr_steps = 0 - sample_steps = num_inference_steps - - self.scheduler.set_timesteps(sample_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - if generator is None: - generator_state = paddle.get_cuda_rng_state() - paddle.Generator().states_["initial_generator"] = copy.deepcopy(generator_state) - else: - paddle.Generator().states_["initial_generator"] = copy.deepcopy(paddle.Generator().states_[generator]) - - num_channels_latents = self.unet.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. 
Denoising loop - num_warmup_steps = len(timesteps) - sample_steps * self.scheduler.order - with self.progress_bar(total=sample_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # start to apply hires.fix on initial latents - if enable_hr: - # 8. determine the upscaled width and height for upscaled images - truncate_width = 0 - truncate_height = 0 - self.hr_upscale_to_width, self.hr_upscale_to_height = self.get_upscaled_width_and_height( - width, height, hr_scale=hr_scale, hr_resize_width=hr_resize_width, hr_resize_height=hr_resize_height - ) - if hr_resize_width != 0 and hr_resize_height != 0: - truncate_width = (self.hr_upscale_to_width - hr_resize_width) // self.vae_scale_factor - truncate_height = (self.hr_upscale_to_height - hr_resize_height) // self.vae_scale_factor - - # 9. special case: do nothing if upscaling is not nesscessary - if self.hr_upscale_to_width == width and self.hr_upscale_to_height == height: - enable_hr = False - denoising_strength = None - - if enable_hr: - # 10. prepare init latents - timesteps, hr_steps = self.get_timesteps(hr_steps, denoising_strength) - init_timestep = timesteps[:1].tile([latents.shape[0]]) - - latents = paddle.nn.functional.interpolate( - latents, - size=( - self.hr_upscale_to_height // self.vae_scale_factor, - self.hr_upscale_to_width // self.vae_scale_factor, - ), - mode=latent_scale_mode, - ) - latents = latents[ - :, - :, - truncate_height // 2 : latents.shape[2] - (truncate_height + 1) // 2, - truncate_width // 2 : latents.shape[3] - (truncate_width + 1) // 2, - ] - - noise = randn_tensor(latents.shape, dtype=latents.dtype, generator="initial_generator") - latents = self.scheduler.add_noise(latents, noise, init_timestep) - - # 11. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs("initial_generator", eta) - - # 12. 
denoising on hires.fix steps - num_warmup_steps = len(timesteps) - hr_steps * self.scheduler.order - with self.progress_bar(total=hr_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 13. process latents into images and perform safety checker - if output_type == "latent": - image = latents - has_nsfw_concept = None - elif output_type == "pil": - image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - image = self.numpy_to_pil(image) - else: - image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/stable_diffusion_mega.py b/ppdiffusers/examples/community/stable_diffusion_mega.py deleted file mode 100644 index b1e89b7c9b83..000000000000 --- a/ppdiffusers/examples/community/stable_diffusion_mega.py +++ /dev/null @@ -1,3333 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import inspect -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import paddle -import PIL -import PIL.Image - -from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - ControlNetModel, - DDIMScheduler, - DDPMScheduler, - DEISMultistepScheduler, - DiffusionPipeline, - DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - HeunDiscreteScheduler, - KDPM2AncestralDiscreteScheduler, - KDPM2DiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - UNet2DConditionModel, - UniPCMultistepScheduler, -) -from ppdiffusers.configuration_utils import FrozenDict -from ppdiffusers.image_processor import VaeImageProcessor -from ppdiffusers.loaders import ( - FromCkptMixin, - LoraLoaderMixin, - TextualInversionLoaderMixin, -) -from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.pipeline_cycle_diffusion import ( - compute_noise, - posterior_sample, -) -from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( - StableDiffusionSafetyChecker, -) -from ppdiffusers.schedulers import KarrasDiffusionSchedulers -from ppdiffusers.utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -import re - -re_attention = re.compile( - r""" -\\\(| -\\\)| -\\\[| -\\]| -\\\\| -\\| -\(| -\[| -:([+-]?[.\d]+)\)| -\)| -]| -[^\\()\[\]:]+| -: -""", - re.X, -) - - -def parse_prompt_attention(text): - r""" - Parses a string with attention tokens and returns a list of pairs: text and its associated weight. - Accepted tokens are: - (abc) - increases attention to abc by a multiplier of 1.1 - (abc:3.12) - increases attention to abc by a multiplier of 3.12 - [abc] - decreases attention to abc by a multiplier of 1.1 - \( - literal character '(' - \[ - literal character '[' - \) - literal character ')' - \] - literal character ']' - \\ - literal character '\' - anything else - just text - >>> parse_prompt_attention('normal text') - [['normal text', 1.0]] - >>> parse_prompt_attention('an (important) word') - [['an ', 1.0], ['important', 1.1], [' word', 1.0]] - >>> parse_prompt_attention('(unbalanced') - [['unbalanced', 1.1]] - >>> parse_prompt_attention('\(literal\]') - [['(literal]', 1.0]] - >>> parse_prompt_attention('(unnecessary)(parens)') - [['unnecessaryparens', 1.1]] - >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).') - [['a ', 1.0], - ['house', 1.5730000000000004], - [' ', 1.1], - ['on', 1.0], - [' a ', 1.1], - ['hill', 0.55], - [', sun, ', 1.1], - ['sky', 1.4641000000000006], - ['.', 1.1]] - """ - - res = [] - round_brackets = [] - square_brackets = [] - - round_bracket_multiplier = 1.1 - square_bracket_multiplier = 1 / 1.1 - - def multiply_range(start_position, multiplier): - for p in range(start_position, len(res)): - res[p][1] *= multiplier - - for m in re_attention.finditer(text): - text = m.group(0) - weight = m.group(1) - - if text.startswith("\\"): - res.append([text[1:], 1.0]) - elif text == "(": - round_brackets.append(len(res)) - elif text == "[": - square_brackets.append(len(res)) - elif weight is not None and len(round_brackets) > 0: - multiply_range(round_brackets.pop(), float(weight)) - elif text == ")" and len(round_brackets) > 0: - multiply_range(round_brackets.pop(), round_bracket_multiplier) - elif text == "]" and 
len(square_brackets) > 0: - multiply_range(square_brackets.pop(), square_bracket_multiplier) - else: - res.append([text, 1.0]) - - for pos in round_brackets: - multiply_range(pos, round_bracket_multiplier) - - for pos in square_brackets: - multiply_range(pos, square_bracket_multiplier) - - if len(res) == 0: - res = [["", 1.0]] - - # merge runs of identical weights - i = 0 - while i + 1 < len(res): - if res[i][1] == res[i + 1][1]: - res[i][0] += res[i + 1][0] - res.pop(i + 1) - else: - i += 1 - - return res - - -def get_prompts_with_weights(pipe, prompt: List[str], max_length: int): - r""" - Tokenize a list of prompts and return its tokens with weights of each token. - No padding, starting or ending token is included. - """ - tokens = [] - weights = [] - truncated = False - for text in prompt: - texts_and_weights = parse_prompt_attention(text) - text_token = [] - text_weight = [] - for word, weight in texts_and_weights: - # tokenize and discard the starting and the ending token - token = pipe.tokenizer(word).input_ids[1:-1] - text_token += token - # copy the weight by length of token - text_weight += [weight] * len(token) - # stop if the text is too long (longer than truncation limit) - if len(text_token) > max_length: - truncated = True - break - # truncate - if len(text_token) > max_length: - truncated = True - text_token = text_token[:max_length] - text_weight = text_weight[:max_length] - tokens.append(text_token) - weights.append(text_weight) - if truncated: - logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples") - return tokens, weights - - -def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77): - r""" - Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length. - """ - max_embeddings_multiples = (max_length - 2) // (chunk_length - 2) - weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length - for i in range(len(tokens)): - tokens[i] = [bos] + tokens[i] + [eos] + [pad] * (max_length - 2 - len(tokens[i])) - if no_boseos_middle: - weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i])) - else: - w = [] - if len(weights[i]) == 0: - w = [1.0] * weights_length - else: - for j in range(max_embeddings_multiples): - w.append(1.0) # weight for starting token in this chunk - w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))] - w.append(1.0) # weight for ending token in this chunk - w += [1.0] * (weights_length - len(w)) - weights[i] = w[:] - - return tokens, weights - - -def get_unweighted_text_embeddings( - pipe, - text_input: paddle.Tensor, - chunk_length: int, - no_boseos_middle: Optional[bool] = True, -): - """ - When the length of tokens is a multiple of the capacity of the text encoder, - it should be split into chunks and sent to the text encoder individually. 
- """ - max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2) - if max_embeddings_multiples > 1: - text_embeddings = [] - for i in range(max_embeddings_multiples): - # extract the i-th chunk - text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone() - - # cover the head and the tail by the starting and the ending tokens - text_input_chunk[:, 0] = text_input[0, 0] - text_input_chunk[:, -1] = text_input[0, -1] - text_embedding = pipe.text_encoder(input_ids=text_input_chunk)[0] - - if no_boseos_middle: - if i == 0: - # discard the ending token - text_embedding = text_embedding[:, :-1] - elif i == max_embeddings_multiples - 1: - # discard the starting token - text_embedding = text_embedding[:, 1:] - else: - # discard both starting and ending tokens - text_embedding = text_embedding[:, 1:-1] - - text_embeddings.append(text_embedding) - text_embeddings = paddle.concat(text_embeddings, axis=1) - else: - text_embeddings = pipe.text_encoder(input_ids=text_input)[0] - return text_embeddings - - -def get_weighted_text_embeddings( - pipe, - prompt: Union[str, List[str]], - uncond_prompt: Optional[Union[str, List[str]]] = None, - max_embeddings_multiples: Optional[int] = 1, - no_boseos_middle: Optional[bool] = False, - skip_parsing: Optional[bool] = False, - skip_weighting: Optional[bool] = False, - **kwargs, -): - r""" - Prompts can be assigned with local weights using brackets. For example, - prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful', - and the embedding tokens corresponding to the words get multiplied by a constant, 1.1. - Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the original mean. - Args: - pipe (`DiffusionPipeline`): - Pipe to provide access to the tokenizer and the text encoder. - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - uncond_prompt (`str` or `List[str]`): - The unconditional prompt or prompts for guide the image generation. If unconditional prompt - is provided, the embeddings of prompt and uncond_prompt are concatenated. - max_embeddings_multiples (`int`, *optional*, defaults to `1`): - The max multiple length of prompt embeddings compared to the max output length of text encoder. - no_boseos_middle (`bool`, *optional*, defaults to `False`): - If the length of text token is multiples of the capacity of text encoder, whether reserve the starting and - ending token in each of the chunk in the middle. - skip_parsing (`bool`, *optional*, defaults to `False`): - Skip the parsing of brackets. - skip_weighting (`bool`, *optional*, defaults to `False`): - Skip the weighting. When the parsing is skipped, it is forced True. 
- """ - max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 - if isinstance(prompt, str): - prompt = [prompt] - - if not skip_parsing: - prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2) - if uncond_prompt is not None: - if isinstance(uncond_prompt, str): - uncond_prompt = [uncond_prompt] - uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2) - else: - prompt_tokens = [ - token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids - ] - prompt_weights = [[1.0] * len(token) for token in prompt_tokens] - if uncond_prompt is not None: - if isinstance(uncond_prompt, str): - uncond_prompt = [uncond_prompt] - uncond_tokens = [ - token[1:-1] - for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids - ] - uncond_weights = [[1.0] * len(token) for token in uncond_tokens] - - # round up the longest length of tokens to a multiple of (model_max_length - 2) - max_length = max([len(token) for token in prompt_tokens]) - if uncond_prompt is not None: - max_length = max(max_length, max([len(token) for token in uncond_tokens])) - - max_embeddings_multiples = min( - max_embeddings_multiples, - (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, - ) - max_embeddings_multiples = max(1, max_embeddings_multiples) - max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 - - # pad the length of tokens and weights - # support bert tokenizer - bos = pipe.tokenizer.bos_token_id if pipe.tokenizer.bos_token_id is not None else pipe.tokenizer.cls_token_id - eos = pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id is not None else pipe.tokenizer.sep_token_id - pad = pipe.tokenizer.pad_token_id - prompt_tokens, prompt_weights = pad_tokens_and_weights( - prompt_tokens, - prompt_weights, - max_length, - bos, - eos, - pad, - no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, - ) - prompt_tokens = paddle.to_tensor(prompt_tokens, dtype=paddle.int64) - if uncond_prompt is not None: - uncond_tokens, uncond_weights = pad_tokens_and_weights( - uncond_tokens, - uncond_weights, - max_length, - bos, - eos, - pad, - no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, - ) - uncond_tokens = paddle.to_tensor(uncond_tokens, dtype=paddle.int64) - - # get the embeddings - text_embeddings = get_unweighted_text_embeddings( - pipe, - prompt_tokens, - pipe.tokenizer.model_max_length, - no_boseos_middle=no_boseos_middle, - ) - prompt_weights = paddle.to_tensor(prompt_weights, dtype=text_embeddings.dtype) - if uncond_prompt is not None: - uncond_embeddings = get_unweighted_text_embeddings( - pipe, - uncond_tokens, - pipe.tokenizer.model_max_length, - no_boseos_middle=no_boseos_middle, - ) - uncond_weights = paddle.to_tensor(uncond_weights, dtype=uncond_embeddings.dtype) - - # assign weights to the prompts and normalize in the sense of mean - # TODO: should we normalize by chunk or in a whole (current implementation)? 
- if (not skip_parsing) and (not skip_weighting): - previous_mean = text_embeddings.mean(axis=[-2, -1]) - text_embeddings *= prompt_weights.unsqueeze(-1) - text_embeddings *= (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) - if uncond_prompt is not None: - previous_mean = uncond_embeddings.mean(axis=[-2, -1]) - uncond_embeddings *= uncond_weights.unsqueeze(-1) - uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) - - if uncond_prompt is not None: - return text_embeddings, uncond_embeddings - return text_embeddings, None - - -def prepare_mask_and_masked_image(image, mask, height=None, width=None, return_image: bool = False): - """ - Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be - converted to ``paddle.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the - ``image`` and ``1`` for the ``mask``. - - The ``image`` will be converted to ``paddle.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be - binarized (``mask > 0.5``) and cast to ``paddle.float32`` too. - - Args: - image (Union[np.array, PIL.Image, paddle.Tensor]): The image to inpaint. - It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width`` - ``paddle.Tensor`` or a ``batch x channels x height x width`` ``paddle.Tensor``. - mask (_type_): The mask to apply to the image, i.e. regions to inpaint. - It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width`` - ``paddle.Tensor`` or a ``batch x 1 x height x width`` ``paddle.Tensor``. - - - Raises: - ValueError: ``paddle.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``paddle.Tensor`` mask - should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions. - TypeError: ``mask`` is a ``paddle.Tensor`` but ``image`` is not - (ot the other way around). - - Returns: - tuple[paddle.Tensor]: The pair (mask, masked_image) as ``paddle.Tensor`` with 4 - dimensions: ``batch x channels x height x width``. 
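Editor's note: a hedged usage sketch for `prepare_mask_and_masked_image` as documented above; the file names are placeholders.

init_image = PIL.Image.open("photo.png").convert("RGB")
mask_image = PIL.Image.open("mask.png").convert("L")
mask, masked_image = prepare_mask_and_masked_image(init_image, mask_image)
# mask:         [1, 1, H, W] binarised to {0, 1} (H, W rounded down to multiples of 8)
# masked_image: [1, 3, H, W] in [-1, 1], zeroed where mask == 1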
- """ - - if image is None: - raise ValueError("`image` input cannot be undefined.") - - if mask is None: - raise ValueError("`mask_image` input cannot be undefined.") - - if isinstance(image, paddle.Tensor): - if not isinstance(mask, paddle.Tensor): - raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not") - - # Batch single image - if image.ndim == 3: - assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" - image = image.unsqueeze(0) - - # Batch and add channel dim for single mask - if mask.ndim == 2: - mask = mask.unsqueeze(0).unsqueeze(0) - - # Batch single mask or add channel dim - if mask.ndim == 3: - # Single batched mask, no channel dim or single mask not batched but channel dim - if mask.shape[0] == 1: - mask = mask.unsqueeze(0) - - # Batched masks no channel dim - else: - mask = mask.unsqueeze(1) - - assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" - assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" - assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" - - # Check image is in [-1, 1] - if image.min() < -1 or image.max() > 1: - raise ValueError("Image should be in [-1, 1] range") - - # Check mask is in [0, 1] - if mask.min() < 0 or mask.max() > 1: - raise ValueError("Mask should be in [0, 1] range") - - # Binarize mask - mask[mask < 0.5] = 0 - mask[mask >= 0.5] = 1 - - # Image as float32 - image = image.cast(dtype=paddle.float32) - elif isinstance(mask, paddle.Tensor): - raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") - else: - # preprocess image - if isinstance(image, (PIL.Image.Image, np.ndarray)): - image = [image] - if isinstance(image, list) and isinstance(image[0], PIL.Image.Image): - # resize all images w.r.t passed height an width - if width is None or height is None: - w, h = image[0].size - else: - w, h = width, height - w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - image = [i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) for i in image] - image = [np.array(i.convert("RGB"))[None, :] for i in image] - image = np.concatenate(image, axis=0) - elif isinstance(image, list) and isinstance(image[0], np.ndarray): - image = np.concatenate([i[None, :] for i in image], axis=0) - - image = image.transpose(0, 3, 1, 2) - image = paddle.to_tensor(image, dtype=paddle.float32) / 127.5 - 1.0 - - # preprocess mask - if isinstance(mask, (PIL.Image.Image, np.ndarray)): - mask = [mask] - - if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image): - # resize all images w.r.t passed height an width - if width is None or height is None: - w, h = mask[0].size - else: - w, h = width, height - w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - mask = [i.resize((w, h), resample=PIL_INTERPOLATION["nearest"]) for i in mask] - mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) - mask = mask.astype(np.float32) / 255.0 - elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): - mask = np.concatenate([m[None, None, :] for m in mask], axis=0) - - mask[mask < 0.5] = 0 - mask[mask >= 0.5] = 1 - mask = paddle.to_tensor(mask) - - masked_image = image * (mask < 0.5) - - # n.b. 
ensure backwards compatibility as old function does not return image - if return_image: - return mask, masked_image, image - - return mask, masked_image - - -class CommonMixIn: - @property - def components(self) -> Dict[str, Any]: - return {k: getattr(self, k) for k in self.config.keys() if not k.startswith("_")} - - def change_scheduler(self, scheduler_type="ddim"): - scheduler_type = scheduler_type.lower() - if scheduler_type == "pndm": - scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) - elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "dpm-multi": - scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "dpm-single": - scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "kdpm2-ancestral": - scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "kdpm2": - scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "unipc-multi": - scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "ddim": - scheduler = DDIMScheduler.from_config( - self.orginal_scheduler_config, - steps_offset=1, - clip_sample=False, - set_alpha_to_one=False, - ) - elif scheduler_type == "ddpm": - scheduler = DDPMScheduler.from_config( - self.orginal_scheduler_config, - ) - elif scheduler_type == "deis-multi": - scheduler = DEISMultistepScheduler.from_config( - self.orginal_scheduler_config, - ) - else: - raise ValueError( - f"Scheduler of type {scheduler_type} doesn't exist! Please choose in {self.supported_scheduler}!" - ) - self.scheduler = scheduler - - def get_timesteps(self, num_inference_steps, strength=1.0): - if strength >= 1: - return self.scheduler.timesteps, num_inference_steps - - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] - - num_inference_steps = num_inference_steps - t_start - # check that number of inference steps is not < 1 - as this doesn't make sense - if num_inference_steps < 1: - raise ValueError( - f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline" - f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline." 
- ) - - return timesteps, num_inference_steps - - def prepare_controlnet_cond( - self, - controlnet_cond, - controlnet_conditioning_scale, - width, - height, - batch_size, - num_images_per_prompt, - dtype, - do_classifier_free_guidance=False, - guess_mode=False, - ): - control_image = self.control_image_processor.preprocess( - controlnet_cond, - height=height, - width=width, - ) - if isinstance(controlnet_conditioning_scale, (float, int)): - controlnet_conditioning_scale = paddle.to_tensor([controlnet_conditioning_scale] * 13, dtype=dtype) - elif isinstance(controlnet_conditioning_scale, (list, tuple)): - controlnet_conditioning_scale = paddle.to_tensor(controlnet_conditioning_scale, dtype=dtype) - else: - raise ValueError( - f"`controlnet_conditioning_scale` has to be of type `float` or `int` or `list` or `tuple` but is {type(controlnet_conditioning_scale)}" - ) - assert controlnet_conditioning_scale.shape[0] == 13 - image_batch_size = control_image.shape[0] - if image_batch_size == 1: - repeat_by = batch_size - else: - # image batch size is the same as prompt batch size - repeat_by = num_images_per_prompt - control_image = control_image.repeat_interleave(repeat_by, axis=0) - control_image = control_image.cast(dtype) - if do_classifier_free_guidance and not guess_mode: - control_image = paddle.concat([control_image] * 2) - return control_image, controlnet_conditioning_scale - - def check_inputs( - self, - prompt, - height=512, - width=512, - callback_steps=1, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - strength=1.0, - ): - if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0: - raise ValueError( - f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}." - ) - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." 
- ) - if strength < 0 or strength > 1: - raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") - - def prepare_latents( - self, - batch_size, - height, - width, - generator, - dtype=None, - latents=None, - image=None, - timestep=None, - is_strength_max=True, - return_noise=False, - return_image_latents=False, - ): - shape = [ - batch_size, - self.vae.config.latent_channels, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ] - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if (image is None or timestep is None) and not is_strength_max: - raise ValueError( - "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise." - "However, either the image or the noise timestep has not been provided." - ) - - if return_image_latents or (latents is None and not is_strength_max): - image = image.cast(dtype=dtype) - image_latents = self._encode_vae_image(image, batch_size=batch_size, generator=generator) - - if latents is None: - noise = randn_tensor(shape, generator=generator, dtype=dtype) - # if strength is 1. then initialise the latents to noise, else initial to image + noise - latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) - # if pure noise then scale the initial latents by the Scheduler's init sigma - latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents - else: - noise = latents - if str(noise.dtype).replace("paddle.", "") != dtype: - noise = noise.cast(dtype) - latents = noise * self.scheduler.init_noise_sigma - - outputs = (latents,) - - if return_noise: - outputs += (noise,) - - if return_image_latents: - outputs += (image_latents,) - - if len(outputs) == 1: - outputs = latents - return outputs - - def prepare_mask_latents( - self, - mask, - masked_image, - batch_size, - height, - width, - generator, - dtype, - do_classifier_free_guidance=False, - return_masked_image_latents=True, - ): - # resize the mask to latents shape as we concatenate the mask to the latents - # we do that before converting to dtype to avoid breaking in case we're using cpu_offload - # and half precision - mask = paddle.nn.functional.interpolate( - mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) - ) - mask = mask.cast(dtype=dtype) - - # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method - if mask.shape[0] < batch_size: - if not batch_size % mask.shape[0] == 0: - raise ValueError( - "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to" - f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number" - " of masks that you pass is divisible by the total requested batch size." 
- ) - mask = mask.tile([batch_size // mask.shape[0], 1, 1, 1]) - - mask = paddle.concat([mask] * 2) if do_classifier_free_guidance else mask - if not return_masked_image_latents: - return mask - - masked_image = masked_image.cast(dtype=dtype) - masked_image_latents = self._encode_vae_image(masked_image, batch_size=batch_size, generator=generator) - if masked_image_latents.shape[0] < batch_size: - if not batch_size % masked_image_latents.shape[0] == 0: - raise ValueError( - "The passed images and the required batch size don't match. Images are supposed to be duplicated" - f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." - " Make sure the number of images that you pass is divisible by the total requested batch size." - ) - masked_image_latents = masked_image_latents.tile([batch_size // masked_image_latents.shape[0], 1, 1, 1]) - - masked_image_latents = ( - paddle.concat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents - ) - - # aligning device to prevent device errors when concating it with the latent model input - masked_image_latents = masked_image_latents.cast(dtype=dtype) - return mask, masked_image_latents - - def is_scheduler_support_step_index(self): - kwargs_keys = set(inspect.signature(self.scheduler.step).parameters.keys()) - return "kwargs" in kwargs_keys or "step_index" in kwargs_keys - - def _encode_vae_image(self, image: paddle.Tensor, batch_size=1, generator=None, **kwargs): - if isinstance(generator, list): - init_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) - ] - init_latents = paddle.concat(init_latents, axis=0) - else: - init_latents = self.vae.encode(image).latent_dist.sample(generator) - return self.vae.config.scaling_factor * init_latents - - def _decode_vae_latents(self, latents: paddle.Tensor, **kwargs): - images_vae = self.vae.decode( - latents, - )[0] - return images_vae - - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - lora_scale: Optional[float] = None, - parse_prompt_type: Optional[str] = "lpw", - max_embeddings_multiples: Optional[int] = 3, - **kwargs, - ): - if parse_prompt_type == "lpw": - return self._encode_prompt_lpw( - prompt, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=lora_scale, - max_embeddings_multiples=max_embeddings_multiples, - **kwargs, - ) - elif parse_prompt_type == "raw": - return self._encode_prompt_raw( - prompt, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=lora_scale, - ) - elif parse_prompt_type == "webui": - raise NotImplementedError("`parse_prompt_type=webui` is not implemented yet.") - - def _encode_prompt_lpw( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Union[str, List[str]], - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - lora_scale: Optional[float] = None, - max_embeddings_multiples: Optional[int] 
= 3, - **kwargs, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `list(int)`): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - lora_scale (`float`, *optional*): - A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. - max_embeddings_multiples (`int`, *optional*, defaults to `3`): - The max multiple length of prompt embeddings compared to the max output length of text encoder. - """ - if lora_scale is not None and isinstance(self, LoraLoaderMixin): - self._lora_scale = lora_scale - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None and negative_prompt_embeds is None: - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - uncond_tokens: List[str] = None - if do_classifier_free_guidance: - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - prompt_embeds, negative_prompt_embeds = get_weighted_text_embeddings( - pipe=self, - prompt=prompt, - uncond_prompt=uncond_tokens, - max_embeddings_multiples=max_embeddings_multiples, - **kwargs, - ) - - prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype) - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - # For classifier free guidance, we need to do two forward passes. 
- # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - def _encode_prompt_raw( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - lora_scale: Optional[float] = None, - **kwargs, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - lora_scale (`float`, *optional*): - A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. - """ - # set lora scale so that monkey patched LoRA - # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): - self._lora_scale = lora_scale - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - config = ( - self.text_encoder.config - if isinstance(self.text_encoder.config, dict) - else self.text_encoder.config.to_dict() - ) - if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype) 
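Editor's note: the LPW path above ends by concatenating negative and positive embeddings so the UNet runs one batched pass per step; the two halves are split and recombined in the denoising loop. A compressed, self-contained sketch (paddle assumed; `unet_forward` is a placeholder for the real UNet call).

import paddle

negative_prompt_embeds = paddle.randn([1, 77, 768])
prompt_embeds = paddle.randn([1, 77, 768])
guidance_scale = 7.5
unet_forward = lambda x: x  # placeholder for self.unet(...)

batched = paddle.concat([negative_prompt_embeds, prompt_embeds])  # [2, 77, 768]
noise_pred_uncond, noise_pred_text = unet_forward(batched).chunk(2)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)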
- - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - config = ( - self.text_encoder.config - if isinstance(self.text_encoder.config, dict) - else self.text_encoder.config.to_dict() - ) - if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - def run_safety_checker(self, image, dtype): - if self.safety_checker is None: - has_nsfw_concept = None - else: - if paddle.is_tensor(image): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - else: - feature_extractor_input = self.image_processor.numpy_to_pil(image) - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - return image, has_nsfw_concept - - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - -class StableDiffusionMegaPipeline( - DiffusionPipeline, CommonMixIn, FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin -): - r""" - Pipeline for mega using Stable Diffusion. - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - controlnet ([`ControlNetModel`] or `List[ControlNetModel]`): - Provides additional conditioning to the unet during the denoising process. If you set multiple ControlNets - as a list, the outputs from each ControlNet are added together to create one combined additional - conditioning. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionMegaSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPFeatureExtractor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - _optional_components = ["controlnet", "safety_checker", "feature_extractor"] - - def __call__(self, *args, **kwargs): - return self.text2img(*args, **kwargs) - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - controlnet: ControlNetModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - requires_safety_checker: bool = True, - ): - super().__init__() - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " - "to update the config accordingly as leaving `steps_offset` might led to incorrect results" - " in future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub," - " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file" - ) - deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." - " `clip_sample` should be set to False in the configuration file. Please make sure to update the" - " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" - " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" - " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" - ) - deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["clip_sample"] = False - scheduler._internal_dict = FrozenDict(new_config) - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
- ) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - controlnet=controlnet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.register_to_config(requires_safety_checker=requires_safety_checker) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) - self.control_image_processor = VaeImageProcessor( - vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False - ) - self.supported_scheduler = [ - "pndm", - "lms", - "preconfig-lms", - "euler", - "euler-ancestral", - "preconfig-euler-ancestral", - "dpm-multi", - "dpm-single", - "unipc-multi", - "ddim", - "ddpm", - "deis-multi", - "heun", - "kdpm2-ancestral", - "kdpm2", - ] - self.orginal_scheduler_config = self.scheduler.config - - @paddle.no_grad() - def do_unet( - self, - do_controlnet, - latents, - latent_model_input, - t, - i, - prompt_embeds, - control_image, - control_conditioning_scale, - cross_attention_kwargs, - guess_mode, - do_classifier_free_guidance, - is_scheduler_support_step_index=False, - ): - if not do_controlnet: - # predict the noise residual - noise_pred_unet = self.unet( - sample=latent_model_input, - timestep=t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - return_dict=False, - )[0] - else: - # controlnet inference - if guess_mode and do_classifier_free_guidance: - # Infer ControlNet only for the conditional batch. - control_model_input = latents - if is_scheduler_support_step_index: - control_model_input = self.scheduler.scale_model_input(control_model_input, t, step_index=i) - else: - control_model_input = self.scheduler.scale_model_input(control_model_input, t) - controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] - else: - control_model_input = latent_model_input - controlnet_prompt_embeds = prompt_embeds - - down_block_res_samples, mid_block_res_sample = self.controlnet( - sample=control_model_input, - timestep=t, - encoder_hidden_states=controlnet_prompt_embeds, - controlnet_cond=control_image, - conditioning_scale=control_conditioning_scale, - guess_mode=guess_mode, - return_dict=False, - ) - - if guess_mode and do_classifier_free_guidance: - # Infered ControlNet only for the conditional batch. - # To apply the output of ControlNet to both the unconditional and conditional batches, - # add 0 to the unconditional batch to keep it unchanged. 
- down_block_res_samples = [paddle.concat([paddle.zeros_like(d), d]) for d in down_block_res_samples] - mid_block_res_sample = paddle.concat([paddle.zeros_like(mid_block_res_sample), mid_block_res_sample]) - - # predict the noise residual - noise_pred_unet = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - down_block_additional_residuals=down_block_res_samples, - mid_block_additional_residual=mid_block_res_sample, - return_dict=False, - )[0] - return noise_pred_unet - - @paddle.no_grad() - def text2img( - self, - prompt: Union[str, List[str]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - parse_prompt_type: Optional[str] = "lpw", - max_embeddings_multiples: Optional[int] = 3, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, - controlnet_conditioning_scale: float = 1.0, - guess_mode: bool = False, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. 
- latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - max_embeddings_multiples (`int`, *optional*, defaults to `3`): - The max multiple length of prompt embeddings compared to the max output length of text encoder. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/ppdiffusers/models/cross_attention.py). - control_cond (`paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[paddle.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[paddle.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): - The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `paddle.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can - also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If - height and/or width are passed, `image` is resized according to them. If multiple ControlNets are - specified in init, images must be passed as a list such that each element of the list can be correctly - batched for input to a single controlnet. - controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): - The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added - to the residual in the original unet. If multiple ControlNets are specified in init, you can set the - corresponding scale as a list. Note that by default, we use a smaller conditioning scale for inpainting - than for [`~DiffusionPipeline.text2img`]. - guess_mode (`bool`, *optional*, defaults to `False`): - In this mode, the ControlNet encoder will try best to recognize the content of the input image even if - you remove all prompts. 
The `guidance_scale` between 3.0 and 5.0 is recommended. - - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - guess_mode = guess_mode or ( - self.controlnet.config.global_pool_conditions if self.controlnet is not None else False - ) - - # 3. Encode input prompt - text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None - ) - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=text_encoder_lora_scale, - max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, - ) - dtype = prompt_embeds.dtype - - # do_controlnet - do_controlnet = controlnet_cond is not None and self.controlnet is not None - if not do_controlnet: - guess_mode = False - if do_controlnet: - control_image, control_conditioning_scale = self.prepare_controlnet_cond( - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=controlnet_conditioning_scale, - width=width, - height=height, - batch_size=batch_size, - dtype=dtype, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, - ) - else: - control_image = None - control_conditioning_scale = None - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps) - - # 5. Prepare latent variables - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - height, - width, - generator=generator, - dtype=dtype, - latents=latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - - is_scheduler_support_step_index = self.is_scheduler_support_step_index() - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) - else: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - noise_pred_unet = self.do_unet( - do_controlnet, - latents, - latent_model_input, - t, - i, - prompt_embeds, - control_image, - control_conditioning_scale, - cross_attention_kwargs, - guess_mode, - do_classifier_free_guidance, - is_scheduler_support_step_index=is_scheduler_support_step_index, - ) - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - else: - noise_pred = noise_pred_unet - # compute the previous noisy sample x_t -> x_t-1 - if is_scheduler_support_step_index: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, step_index=i, return_pred_original_sample=False, **extra_step_kwargs - ) - else: - scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) - latents = scheduler_output.prev_sample.cast(dtype) - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if not output_type == "latent": - image = self._decode_vae_latents(latents / self.vae.config.scaling_factor) - image, has_nsfw_concept = self.run_safety_checker(image, dtype) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - @paddle.no_grad() - def img2img( - self, - prompt: Union[str, List[str]] = None, - image: Union[paddle.Tensor, PIL.Image.Image] = None, - height: Optional[int] = None, - width: Optional[int] = None, - strength: float = 0.8, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - parse_prompt_type: Optional[str] = "lpw", - max_embeddings_multiples: Optional[int] = 3, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, - controlnet_conditioning_scale: float = 1.0, - 
guess_mode: bool = False, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. - strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` - will be used as a starting point, adding more noise to it the larger the `strength`. The number of - denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will - be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. This parameter will be modulated by `strength`. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - max_embeddings_multiples (`int`, *optional*, defaults to `3`): - The max multiple length of prompt embeddings compared to the max output length of text encoder. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. 
Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/ppdiffusers/models/cross_attention.py). - control_cond (`paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[paddle.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[paddle.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): - The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `paddle.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can - also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If - height and/or width are passed, `image` is resized according to them. If multiple ControlNets are - specified in init, images must be passed as a list such that each element of the list can be correctly - batched for input to a single controlnet. - controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): - The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added - to the residual in the original unet. If multiple ControlNets are specified in init, you can set the - corresponding scale as a list. Note that by default, we use a smaller conditioning scale for inpainting - than for [`~DiffusionPipeline.img2img`]. - guess_mode (`bool`, *optional*, defaults to `False`): - In this mode, the ControlNet encoder will try best to recognize the content of the input image even if - you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended. - - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. 
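For reference, a minimal usage sketch of this img2img entry point. This is illustrative only: it assumes the pipeline class defined in this file has already been instantiated as `pipe` with a loaded checkpoint, and that `input.png` exists; only parameters that appear in the signature above are used.

    import PIL.Image

    # starting image for the image-to-image pass
    init_image = PIL.Image.open("input.png").convert("RGB").resize((512, 512))

    result = pipe.img2img(
        prompt="a fantasy landscape, trending on artstation",
        image=init_image,
        strength=0.75,              # how strongly to transform the input image
        num_inference_steps=50,
        guidance_scale=7.5,
        negative_prompt="low quality, blurry",
    )
    # the pipeline returns a StableDiffusionPipelineOutput; .images is a list of PIL images
    result.images[0].save("img2img_out.png")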
- """ - if image is None: - return self.text2img( - prompt=prompt, - height=height, - width=width, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - negative_prompt=negative_prompt, - num_images_per_prompt=num_images_per_prompt, - eta=eta, - generator=generator, - latents=latents, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - output_type=output_type, - return_dict=return_dict, - callback=callback, - callback_steps=callback_steps, - cross_attention_kwargs=cross_attention_kwargs, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=controlnet_conditioning_scale, - guess_mode=guess_mode, - max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, - ) - # 0. Preprocess image - init_image = self.image_processor.preprocess(image, height=height, width=width) - height, width = init_image.shape[-2:] - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - strength, - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - guess_mode = guess_mode or ( - self.controlnet.config.global_pool_conditions if self.controlnet is not None else False - ) - - # 3. Encode input prompt - text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None - ) - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=text_encoder_lora_scale, - max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, - ) - dtype = prompt_embeds.dtype - - # do_controlnet - do_controlnet = controlnet_cond is not None and self.controlnet is not None - if not do_controlnet: - guess_mode = False - if do_controlnet: - control_image, control_conditioning_scale = self.prepare_controlnet_cond( - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=controlnet_conditioning_scale, - width=width, - height=height, - batch_size=batch_size, - dtype=dtype, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, - ) - else: - control_image = None - control_conditioning_scale = None - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - - # 5. Prepare latent variables - # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) - # create a boolean to check if the strength is set to 1. 
if so then initialise the latents with pure noise - is_strength_max = strength == 1.0 - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - height, - width, - generator=generator, - dtype=dtype, - latents=latents, - image=init_image, - timestep=latent_timestep, - is_strength_max=is_strength_max, - ) - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - noise_pred_unet = self.do_unet( - do_controlnet, - latents, - latent_model_input, - t, - i, - prompt_embeds, - control_image, - control_conditioning_scale, - cross_attention_kwargs, - guess_mode, - do_classifier_free_guidance, - ) - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - else: - noise_pred = noise_pred_unet - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] - latents = latents.cast(dtype) - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if not output_type == "latent": - image = self._decode_vae_latents(latents / self.vae.config.scaling_factor) - image, has_nsfw_concept = self.run_safety_checker(image, dtype) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - @paddle.no_grad() - def inpaint_legacy( - self, - prompt: Union[str, List[str]] = None, - image: Union[paddle.Tensor, PIL.Image.Image] = None, - mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, - height: int = None, - width: int = None, - strength: float = 1.0, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - add_predicted_noise: Optional[bool] = False, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - parse_prompt_type: Optional[str] = "lpw", - max_embeddings_multiples: Optional[int] = 3, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - 
controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, - controlnet_conditioning_scale: float = 1.0, - guess_mode: bool = False, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. This is the image whose masked region will be inpainted. - mask_image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be - replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a - PIL image, it will be converted to a single channel (luminance) before use. If mask is a tensor, the - expected shape should be either `(B, H, W, C)` or `(B, C, H, W)`, where C is 1 or 3. - height (`int`, *optional*, defaults to None): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to None): - The width in pixels of the generated image. - strength (`float`, *optional*, defaults to 1.0): - Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength` - is 1, the denoising process will be run on the masked area for the full number of iterations specified - in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more noise to - that region the larger the `strength`. If `strength` is 0, no inpainting will occur. - num_inference_steps (`int`, *optional*, defaults to 50): - The reference number of denoising steps. More denoising steps usually lead to a higher quality image at - the expense of slower inference. This parameter will be modulated by `strength`, as explained above. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - add_predicted_noise (`bool`, *optional*, defaults to False): - Use predicted noise instead of random noise when constructing noisy versions of the original image in - the reverse diffusion process - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of [paddle generator(s)] to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noise tensor, sampled from a Gaussian distribution, to be used as inputs for image - generation. 
If not provided, a noise tensor will ge generated by sampling using the supplied random - `generator`. - max_embeddings_multiples (`int`, *optional*, defaults to `3`): - The max multiple length of prompt embeddings compared to the max output length of text encoder. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/ppdiffusers/models/cross_attention.py). - control_cond (`paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[paddle.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[paddle.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): - The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `paddle.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can - also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If - height and/or width are passed, `image` is resized according to them. If multiple ControlNets are - specified in init, images must be passed as a list such that each element of the list can be correctly - batched for input to a single controlnet. - controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): - The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added - to the residual in the original unet. If multiple ControlNets are specified in init, you can set the - corresponding scale as a list. Note that by default, we use a smaller conditioning scale for inpainting - than for [`~DiffusionPipeline.inpaint_legacy`]. - guess_mode (`bool`, *optional*, defaults to `False`): - In this mode, the ControlNet encoder will try best to recognize the content of the input image even if - you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended. Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. 
- When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Preprocess mask and image - mask, masked_image, init_image = prepare_mask_and_masked_image( - image, - mask_image, - height, - width, - return_image=True, - ) - height, width = init_image.shape[-2:] - - # 1. Check inputs - self.check_inputs( - prompt, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - strength, - ) - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - guess_mode = guess_mode or ( - self.controlnet.config.global_pool_conditions if self.controlnet is not None else False - ) - - # 3. Encode input prompt - text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None - ) - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=text_encoder_lora_scale, - max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, - ) - dtype = prompt_embeds.dtype - - # do_controlnet - do_controlnet = controlnet_cond is not None and self.controlnet is not None - if not do_controlnet: - guess_mode = False - if do_controlnet: - control_image, control_conditioning_scale = self.prepare_controlnet_cond( - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=controlnet_conditioning_scale, - width=width, - height=height, - batch_size=batch_size, - dtype=dtype, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, - ) - else: - control_image = None - control_conditioning_scale = None - - # 4. set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) - # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise - is_strength_max = strength == 1.0 - - # 5. Prepare latent variables - latents, noise, image_latents = self.prepare_latents( - batch_size * num_images_per_prompt, - height, - width, - generator=generator, - dtype=dtype, - latents=latents, - image=init_image, - timestep=latent_timestep, - is_strength_max=is_strength_max, - return_noise=True, - return_image_latents=True, - ) - - # 6. Prepare mask latent variables - mask = self.prepare_mask_latents( - mask, - masked_image, - batch_size * num_images_per_prompt, - height, - width, - dtype=dtype, - generator=generator, - do_classifier_free_guidance=do_classifier_free_guidance, - return_masked_image_latents=False, - ) - - # 7. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - if do_classifier_free_guidance: - init_mask = mask[: mask.shape[0] // 2] - else: - init_mask = mask - - # 8. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - noise_pred_unet = self.do_unet( - do_controlnet, - latents, - latent_model_input, - t, - i, - prompt_embeds, - control_image, - control_conditioning_scale, - cross_attention_kwargs, - guess_mode, - do_classifier_free_guidance, - ) - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - else: - noise_pred = noise_pred_unet - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] - - if i < len(timesteps) - 1: - # masking - if add_predicted_noise: - init_latents_proper = self.scheduler.add_noise(image_latents, noise_pred_uncond, t) - else: - # https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc - noise_timestep = timesteps[i + 1] - init_latents_proper = self.scheduler.add_noise(image_latents, noise, noise_timestep) - else: - init_latents_proper = image_latents - - latents = (1 - init_mask) * init_latents_proper + init_mask * latents - latents = latents.cast(dtype) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - if not output_type == "latent": - image = self._decode_vae_latents(latents / self.vae.config.scaling_factor) - image, has_nsfw_concept = self.run_safety_checker(image, dtype) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - @paddle.no_grad() - def inpaint( - self, - prompt: Union[str, List[str]] = None, - image: Union[paddle.Tensor, PIL.Image.Image] = None, - mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, - height: int = None, - width: int = None, - strength: float = 1.0, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - add_predicted_noise: Optional[bool] = False, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - parse_prompt_type: Optional[str] = "lpw", - max_embeddings_multiples: Optional[int] = 3, - prompt_embeds: Optional[paddle.Tensor] = None, - 
negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, - controlnet_conditioning_scale: float = 1.0, - guess_mode: bool = False, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. This is the image whose masked region will be inpainted. - mask_image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be - replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a - PIL image, it will be converted to a single channel (luminance) before use. If mask is a tensor, the - expected shape should be either `(B, H, W, C)` or `(B, C, H, W)`, where C is 1 or 3. - height (`int`, *optional*, defaults to None): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to None): - The width in pixels of the generated image. - strength (`float`, *optional*, defaults to 1.0): - Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength` - is 1, the denoising process will be run on the masked area for the full number of iterations specified - in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more noise to - that region the larger the `strength`. If `strength` is 0, no inpainting will occur. - num_inference_steps (`int`, *optional*, defaults to 50): - The reference number of denoising steps. More denoising steps usually lead to a higher quality image at - the expense of slower inference. This parameter will be modulated by `strength`, as explained above. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - add_predicted_noise (`bool`, *optional*, defaults to False): - Use predicted noise instead of random noise when constructing noisy versions of the original image in - the reverse diffusion process - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. 
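The white/black mask semantics described above are enforced inside the denoising loop further below for the 4-channel ("legacy") UNet path: at each step the original image latents are re-noised to the next timestep and blended with the evolving latents through the downsampled mask. A condensed sketch of that blend, paraphrasing the loop body in this file:

    # init_mask is ~1 over the region to repaint and ~0 over the region to keep
    noise_timestep = timesteps[i + 1]
    init_latents_proper = self.scheduler.add_noise(image_latents, noise, noise_timestep)
    latents = (1 - init_mask) * init_latents_proper + init_mask * latents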
- generator (`paddle.Generator`, *optional*): - One or a list of [paddle generator(s)] to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noise tensor, sampled from a Gaussian distribution, to be used as inputs for image - generation. If not provided, a noise tensor will ge generated by sampling using the supplied random - `generator`. - max_embeddings_multiples (`int`, *optional*, defaults to `3`): - The max multiple length of prompt embeddings compared to the max output length of text encoder. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/ppdiffusers/models/cross_attention.py). - control_cond (`paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[paddle.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[paddle.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): - The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `paddle.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can - also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If - height and/or width are passed, `image` is resized according to them. If multiple ControlNets are - specified in init, images must be passed as a list such that each element of the list can be correctly - batched for input to a single controlnet. - controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): - The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added - to the residual in the original unet. If multiple ControlNets are specified in init, you can set the - corresponding scale as a list. Note that by default, we use a smaller conditioning scale for inpainting - than for [`~DiffusionPipeline.text2img`]. - guess_mode (`bool`, *optional*, defaults to `False`): - In this mode, the ControlNet encoder will try best to recognize the content of the input image even if - you remove all prompts. 
The `guidance_scale` between 3.0 and 5.0 is recommended. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Preprocess mask and image - mask, masked_image, init_image = prepare_mask_and_masked_image( - image, - mask_image, - height, - width, - return_image=True, - ) - height, width = init_image.shape[-2:] - - # 1. Check inputs - self.check_inputs( - prompt, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - strength, - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - guess_mode = guess_mode or ( - self.controlnet.config.global_pool_conditions if self.controlnet is not None else False - ) - - # 3. Encode input prompt - text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None - ) - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=text_encoder_lora_scale, - max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, - ) - dtype = prompt_embeds.dtype - - # 4. set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) - # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise - is_strength_max = strength == 1.0 - - # 5. Prepare latent variables - num_channels_latents = self.vae.config.latent_channels - num_channels_unet = self.unet.config.in_channels - is_legacy = return_image_latents = num_channels_unet == 4 - - latents_outputs = self.prepare_latents( - batch_size * num_images_per_prompt, - height, - width, - generator=generator, - dtype=dtype, - latents=latents, - image=init_image, - timestep=latent_timestep, - is_strength_max=is_strength_max, - return_noise=True, - return_image_latents=return_image_latents, - ) - - if return_image_latents: - latents, noise, image_latents = latents_outputs - else: - latents, noise = latents_outputs - - # 6. Prepare mask latent variables - mask, masked_image_latents = self.prepare_mask_latents( - mask, - masked_image, - batch_size * num_images_per_prompt, - height, - width, - dtype=dtype, - generator=generator, - do_classifier_free_guidance=do_classifier_free_guidance, - return_masked_image_latents=True, - ) - - # 7. 
Check that sizes of mask, masked image and latents match - if num_channels_unet == 9: - # default case for runwayml/stable-diffusion-inpainting - num_channels_mask = mask.shape[1] - num_channels_masked_image = masked_image_latents.shape[1] - if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: - raise ValueError( - f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" - f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" - f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" - f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" - " `pipeline.unet` or your `mask_image` or `image` input." - ) - elif num_channels_unet != 4: - raise ValueError(f"The unet should have either 4 or 9 input channels, not {num_channels_unet}.") - - # do_controlnet - do_controlnet = controlnet_cond is not None and self.controlnet is not None and is_legacy - if not do_controlnet: - guess_mode = False - if do_controlnet: - control_image, control_conditioning_scale = self.prepare_controlnet_cond( - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=controlnet_conditioning_scale, - width=width, - height=height, - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - dtype=dtype, - do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, - ) - else: - control_image = None - control_conditioning_scale = None - # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - if do_classifier_free_guidance: - init_mask = mask[: mask.shape[0] // 2] - else: - init_mask = mask - - # 9. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - if not is_legacy: - # concat latents, mask, masked_image_latents in the channel dimension - latent_model_input = paddle.concat([latent_model_input, mask, masked_image_latents], axis=1) - - noise_pred_unet = self.do_unet( - do_controlnet, - latents, - latent_model_input, - t, - i, - prompt_embeds, - control_image, - control_conditioning_scale, - cross_attention_kwargs, - guess_mode, - do_classifier_free_guidance, - ) - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - else: - noise_pred = noise_pred_unet - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] - - if is_legacy: - if i < len(timesteps) - 1: - # masking - if add_predicted_noise: - init_latents_proper = self.scheduler.add_noise(image_latents, noise_pred_uncond, t) - else: - # https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc - noise_timestep = timesteps[i + 1] - init_latents_proper = self.scheduler.add_noise(image_latents, noise, noise_timestep) - else: - init_latents_proper = image_latents - latents = (1 - init_mask) * init_latents_proper + init_mask * latents - latents = latents.cast(dtype) - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if not output_type == "latent": - image = self._decode_vae_latents(latents / self.vae.config.scaling_factor) - image, has_nsfw_concept = self.run_safety_checker(image, dtype) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - def check_inputs_hires_fix( - self, - prompt, - height, - width, - callback_steps, - hr_scale, - hr_resize_height, - hr_resize_width, - denoising_strength, - latent_scale_mode, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0: - raise ValueError( - f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}." - ) - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." 
- ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if hr_scale < 0: - raise ValueError(f"`hr_scale` should be greater than 0, but received {hr_scale}") - - if hr_resize_height % 8 != 0 or hr_resize_width % 8 != 0: - raise ValueError( - f"`hr_resize_height` and `hr_resize_width` have to be divisible by 8 but are {hr_resize_height} and {hr_resize_width}." - ) - - if denoising_strength > 1 or denoising_strength < 0: - raise ValueError(f"`denoising_strength` should be set between 0 and 1, but received {denoising_strength}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if latent_scale_mode not in ["nearest", "bilinear", "bicubic", "area"]: - raise ValueError( - f"`latent_scale_mode` has to be one of ['nearest', 'bilinear', 'bicubic', 'area'], but received {latent_scale_mode}." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}."
- ) - - def get_upscaled_width_and_height(self, width, height, hr_scale=2, hr_resize_width=0, hr_resize_height=0): - if hr_resize_width == 0 and hr_resize_height == 0: - hr_upscale_to_width = int(width * hr_scale) - hr_upscale_to_height = int(height * hr_scale) - else: - if hr_resize_height == 0: - hr_upscale_to_width = hr_resize_width - hr_upscale_to_height = hr_resize_width * height // width - elif hr_resize_width == 0: - hr_upscale_to_width = hr_resize_height * width // height - hr_upscale_to_height = hr_resize_height - else: - src_ratio = width / height - dst_ratio = hr_resize_width / hr_resize_height - - if src_ratio < dst_ratio: - hr_upscale_to_width = hr_resize_width - hr_upscale_to_height = hr_resize_width * height // width - else: - hr_upscale_to_width = hr_resize_height * width // height - hr_upscale_to_height = hr_resize_height - - return hr_upscale_to_width, hr_upscale_to_height - - def get_hires_fix_timesteps(self, denoising_steps, denoising_strength): - steps = int(denoising_steps / min(denoising_strength, 0.999)) - self.scheduler.set_timesteps(steps) - timesteps = self.scheduler.timesteps[steps - denoising_steps :] - return timesteps, denoising_steps - - @paddle.no_grad() - def hires_fix( - self, - prompt: Union[str, List[str]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 40, - hires_ratio: Optional[float] = 0.5, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - parse_prompt_type: Optional[str] = "lpw", - max_embeddings_multiples: Optional[int] = 3, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - enable_hr: Optional[bool] = True, - hr_scale: Optional[float] = 2.0, - hr_resize_width: Optional[int] = 0, - hr_resize_height: Optional[int] = 0, - denoising_strength: Optional[float] = 0.7, - latent_scale_mode: Optional[str] = "nearest", - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, - controlnet_conditioning_scale: float = 1.0, - guess_mode: bool = False, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 40): - The number of denoising steps, equal to sample_steps and hr_steps. samples_steps means the initial - denoising steps, and hr_steps means hires denoising steps. More denoising steps usually lead to a - higher quality image at the expense of slower inference. - hires_ratio (`float`, *optional*, defaults to 0.5): - The step proportion of hires.fix, that means hr_steps = int(num_inference_steps * hires_ratio). 
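A small worked example of the two hires.fix helpers defined just above. The values follow directly from the code as written; `pipe` is assumed to be an instance of this pipeline.

    # hr_scale only: 512x512 is upscaled to 1024x1024
    assert pipe.get_upscaled_width_and_height(512, 512, hr_scale=2) == (1024, 1024)

    # an explicit hr_resize_width fixes the width; the height is derived to keep the aspect ratio
    assert pipe.get_upscaled_width_and_height(512, 768, hr_resize_width=640) == (640, 960)

    # with denoising_strength=0.7 and 20 hires steps, int(20 / 0.7) = 28 timesteps are
    # scheduled and only the last 20 of them are kept for the hires pass
    timesteps, hr_steps = pipe.get_hires_fix_timesteps(20, 0.7)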
- guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - max_embeddings_multiples (`int`, *optional*, defaults to `3`): - The max multiple length of prompt embeddings compared to the max output length of text encoder. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - hr_steps (`int`, *optional*, defaults to 30): - The number of second denoising steps about high resolution fixing. - hr_scale (`float`, *optional*, defaults to 2.0): - The upscaler to expand the width and height of image. if set 2.0, it means that expand width and height of a image to width*2.0 and height*2.0. - hr_resize_width (`int`, *optional*, defaults to 0): - It enable users to specify the upscaled width mannually. 
if hr_resize_width != 0, the program will use it to compute the scaled width and height instead of hr_scale. - hr_resize_height (`int`, *optional*, defaults to 0): - It enables users to specify the upscaled height manually. if hr_resize_height != 0, the program will use it to compute the scaled width and height instead of hr_scale. - denoising_strength (`float`, *optional*, defaults to 0.7): - The denoising strength applied on the hires.fix steps. It takes a value between 0 and 1. - latent_scale_mode (`str`, *optional*, defaults to nearest): - The interpolation method used to upscale the initial latents; choose one of [nearest, bilinear, bicubic, area]. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/ppdiffusers/models/cross_attention.py). - control_cond (`paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[paddle.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[paddle.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): - The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `paddle.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can - also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If - height and/or width are passed, `image` is resized according to them. If multiple ControlNets are - specified in init, images must be passed as a list such that each element of the list can be correctly - batched for input to a single controlnet. - controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): - The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added - to the residual in the original unet. If multiple ControlNets are specified in init, you can set the - corresponding scale as a list. Note that by default, we use a smaller conditioning scale for inpainting - than for [`~DiffusionPipeline.inpaint_legacy`]. - guess_mode (`bool`, *optional*, defaults to `False`): - In this mode, the ControlNet encoder will try its best to recognize the content of the input image even if - you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended. - - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0.
Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs_hires_fix( - prompt, - height, - width, - callback_steps, - hr_scale, - hr_resize_height, - hr_resize_width, - denoising_strength, - latent_scale_mode, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - guess_mode = guess_mode or ( - self.controlnet.config.global_pool_conditions if self.controlnet is not None else False - ) - - # 3. Encode input prompt - text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None - ) - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=text_encoder_lora_scale, - max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, - ) - dtype = prompt_embeds.dtype - - # do_controlnet - do_controlnet = controlnet_cond is not None and self.controlnet is not None - if not do_controlnet: - guess_mode = False - if do_controlnet: - control_image, control_conditioning_scale = self.prepare_controlnet_cond( - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=controlnet_conditioning_scale, - width=width, - height=height, - batch_size=batch_size, - dtype=dtype, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, - ) - else: - control_image = None - control_conditioning_scale = None - # 4. Prepare timesteps - if enable_hr: - hr_steps = int(num_inference_steps * hires_ratio) - sample_steps = num_inference_steps - hr_steps - else: - hr_steps = 0 - sample_steps = num_inference_steps - - # 4. Prepare timesteps - self.scheduler.set_timesteps(sample_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - if generator is None: - generator_state = paddle.get_cuda_rng_state() - paddle.Generator().states_["initial_generator"] = copy.deepcopy(generator_state) - else: - paddle.Generator().states_["initial_generator"] = copy.deepcopy(paddle.Generator().states_[generator]) - - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - height, - width, - generator=generator, - dtype=dtype, - latents=latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. 
Denoising loop - num_warmup_steps = len(timesteps) - sample_steps * self.scheduler.order - - with self.progress_bar(total=sample_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - noise_pred_unet = self.do_unet( - do_controlnet, - latents, - latent_model_input, - t, - i, - prompt_embeds, - control_image, - control_conditioning_scale, - cross_attention_kwargs, - guess_mode, - do_classifier_free_guidance, - ) - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - else: - noise_pred = noise_pred_unet - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) - latents = scheduler_output.prev_sample.cast(dtype) - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # start to apply hires.fix on initial latents - if enable_hr: - # 8. determine the upscaled width and height for upscaled images - truncate_width = 0 - truncate_height = 0 - hr_upscale_to_width, hr_upscale_to_height = self.get_upscaled_width_and_height( - width, height, hr_scale=hr_scale, hr_resize_width=hr_resize_width, hr_resize_height=hr_resize_height - ) - if hr_resize_width != 0 and hr_resize_height != 0: - truncate_width = (hr_upscale_to_width - hr_resize_width) // self.vae_scale_factor - truncate_height = (hr_upscale_to_height - hr_resize_height) // self.vae_scale_factor - - # 9. special case: do nothing if upscaling is not nesscessary - if hr_upscale_to_width == width and hr_upscale_to_height == height: - enable_hr = False - denoising_strength = None - - if enable_hr: - if do_controlnet: - control_image, control_conditioning_scale = self.prepare_controlnet_cond( - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=controlnet_conditioning_scale, - width=hr_upscale_to_width, - height=hr_upscale_to_height, - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - dtype=dtype, - do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, - ) - else: - control_image = None - control_conditioning_scale = None - # 10. prepare init latents - timesteps, hr_steps = self.get_hires_fix_timesteps(hr_steps, denoising_strength) - init_timestep = timesteps[:1].tile([latents.shape[0]]) - - latents = paddle.nn.functional.interpolate( - latents, - size=( - hr_upscale_to_height // self.vae_scale_factor, - hr_upscale_to_width // self.vae_scale_factor, - ), - mode=latent_scale_mode, - ) - latents = latents[ - :, - :, - truncate_height // 2 : latents.shape[2] - (truncate_height + 1) // 2, - truncate_width // 2 : latents.shape[3] - (truncate_width + 1) // 2, - ] - - noise = randn_tensor(latents.shape, dtype=latents.dtype, generator="initial_generator") - latents = self.scheduler.add_noise(latents, noise, init_timestep) - - # 11. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs("initial_generator", eta) - - # 12. 
denoising on hires.fix steps - num_warmup_steps = len(timesteps) - hr_steps * self.scheduler.order - with self.progress_bar(total=hr_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - noise_pred_unet = self.do_unet( - do_controlnet, - latents, - latent_model_input, - t, - i, - prompt_embeds, - control_image, - control_conditioning_scale, - cross_attention_kwargs, - guess_mode, - do_classifier_free_guidance, - ) - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - else: - noise_pred = noise_pred_unet - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - latents = latents.cast(dtype) - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if not output_type == "latent": - image = self._decode_vae_latents(latents / self.vae.config.scaling_factor) - image, has_nsfw_concept = self.run_safety_checker(image, dtype) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - @paddle.no_grad() - def cycle_diffusion( - self, - prompt: Union[str, List[str]], - source_prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image] = None, - height: Optional[int] = None, - width: Optional[int] = None, - strength: float = 0.8, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[paddle.Tensor] = None, - source_guidance_scale: Optional[float] = 1, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.1, - latents: Optional[paddle.Tensor] = None, - parse_prompt_type: Optional[str] = "lpw", - max_embeddings_multiples: Optional[int] = 3, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`): - The target prompt or prompts to guide the image generation. - source_prompt (`str` or `List[str]`): - The source prompt or prompts describe the input image. - height (`int`, *optional*, defaults to None): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to None): - The width in pixels of the generated image. 
- image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. - strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. - `image` will be used as a starting point, adding more noise to it the larger the `strength`. The - number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added - noise will be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. This parameter will be modulated by `strength`. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. A higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The negative prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - source_guidance_scale (`float`, *optional*, defaults to 1): - Guidance scale for the source prompt. This is useful to control the amount of influence the source - prompt has on the encoding. - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.1): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will be generated by sampling using the supplied random `generator`. - max_embeddings_multiples (`int`, *optional*, defaults to `3`): - The max multiple length of prompt embeddings compared to the max output length of the text encoder. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from the `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from the `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image.
Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - self.change_scheduler("ddim") - # 0. Preprocess image - init_image = self.image_processor.preprocess(image, height=height, width=width) - height, width = init_image.shape[-2:] - - # 1. Check inputs - self.check_inputs( - prompt, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - strength, - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode target prompt and source prompt - text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None - ) - - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=text_encoder_lora_scale, - max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, - ) - source_prompt_embeds = self._encode_prompt( - source_prompt, - num_images_per_prompt, - do_classifier_free_guidance, - lora_scale=text_encoder_lora_scale, - max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, - ) - dtype = prompt_embeds.dtype - - # 5. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - # 6. Prepare latent variables - # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) - is_strength_max = strength == 1.0 - latents, clean_latents = self.prepare_latents( - batch_size * num_images_per_prompt, - height, - width, - dtype=dtype, - generator=generator, - latents=latents, - image=init_image, - timestep=latent_timestep, - is_strength_max=is_strength_max, - return_image_latents=True, - ) - source_latents = latents - - # 7. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - generator = extra_step_kwargs.pop("generator", None) - - # 8. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) - source_latent_model_input = paddle.concat([source_latents] * 2) - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - source_latent_model_input = self.scheduler.scale_model_input(source_latent_model_input, t) - - # predict the noise residual - concat_latent_model_input = paddle.stack( - [ - source_latent_model_input[0], - latent_model_input[0], - source_latent_model_input[1], - latent_model_input[1], - ], - axis=0, - ) - concat_prompt_embeds = paddle.stack( - [ - source_prompt_embeds[0], - prompt_embeds[0], - source_prompt_embeds[1], - prompt_embeds[1], - ], - axis=0, - ) - - # predict the noise residual - concat_noise_pred = self.unet( - sample=concat_latent_model_input, - timestep=t, - encoder_hidden_states=concat_prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - return_dict=False, - )[0] - - # perform guidance - ( - source_noise_pred_uncond, - noise_pred_uncond, - source_noise_pred_text, - noise_pred_text, - ) = concat_noise_pred.chunk(4, axis=0) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - source_noise_pred = source_noise_pred_uncond + source_guidance_scale * ( - source_noise_pred_text - source_noise_pred_uncond - ) - - # Sample source_latents from the posterior distribution. - prev_source_latents = posterior_sample( - self.scheduler, source_latents, t, clean_latents, generator=generator, **extra_step_kwargs - ) - # Compute noise. 
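- # compute_noise recovers the stochastic noise implied by that source transition under the current scheduler; - # it is reused below as `variance_noise` when stepping the target latents, which is what keeps the edited - # image tied to the structure of the source image (the core CycleDiffusion mechanism).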
- noise = compute_noise( - self.scheduler, prev_source_latents, source_latents, t, source_noise_pred, **extra_step_kwargs - ) - source_latents = prev_source_latents.cast(dtype) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, t, latents, variance_noise=noise, **extra_step_kwargs - ).prev_sample - - latents = latents.cast(dtype) - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if not output_type == "latent": - image = self._decode_vae_latents(latents / self.vae.config.scaling_factor) - image, has_nsfw_concept = self.run_safety_checker(image, dtype) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/webui_stable_diffusion.py b/ppdiffusers/examples/community/webui_stable_diffusion.py deleted file mode 100644 index 057bb75a7b78..000000000000 --- a/ppdiffusers/examples/community/webui_stable_diffusion.py +++ /dev/null @@ -1,2151 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# modified from https://github.com/AUTOMATIC1111/stable-diffusion-webui -# Here is the AGPL-3.0 license https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/master/LICENSE.txt - -import inspect -import shutil -from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Union - -import paddle -import paddle.nn as nn -import PIL -import PIL.Image - -from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer -from ppdiffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel -from ppdiffusers.pipelines.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( - StableDiffusionSafetyChecker, -) -from ppdiffusers.schedulers import KarrasDiffusionSchedulers -from ppdiffusers.utils import ( - PIL_INTERPOLATION, - PPDIFFUSERS_CACHE, - logging, - ppdiffusers_url_download, - randn_tensor, - safetensors_load, - smart_load, - torch_load, -) - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -import copy -import os -import os.path - -from huggingface_hub.file_download import _request_wrapper, hf_raise_for_status - -# lark omegaconf - - -def resize_image(resize_mode, im, width, height, upscaler_name=None): - """ - Resizes an image with the specified resize_mode, width, and height. - - Args: - resize_mode: The mode to use when resizing the image. - -1: do nothing. - 0: Resize the image to the specified width and height. - 1: Resize the image to fill the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, cropping the excess. - 2: Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, filling empty with data from image. - im: The image to resize. - width: The width to resize the image to. - height: The height to resize the image to. - upscaler_name: The name of the upscaler to use. If not provided, defaults to opts.upscaler_for_img2img. 
- """ - # ["Just resize", "Crop and resize", "Resize and fill", "Do nothing"] - # 0 1 2 -1 - def resize(im, w, h): - if upscaler_name is None or upscaler_name == "None" or im.mode == "L": - return im.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) - - if resize_mode == -1: - return im - elif resize_mode == 0: - res = resize(im, width, height) - - elif resize_mode == 1: - ratio = width / height - src_ratio = im.width / im.height - - src_w = width if ratio > src_ratio else im.width * height // im.height - src_h = height if ratio <= src_ratio else im.height * width // im.width - - resized = resize(im, src_w, src_h) - res = Image.new("RGB", (width, height)) - res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) - - else: - ratio = width / height - src_ratio = im.width / im.height - - src_w = width if ratio < src_ratio else im.width * height // im.height - src_h = height if ratio >= src_ratio else im.height * width // im.width - - resized = resize(im, src_w, src_h) - res = Image.new("RGB", (width, height)) - res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) - - if ratio < src_ratio: - fill_height = height // 2 - src_h // 2 - res.paste(resized.resize((width, fill_height), box=(0, 0, width, 0)), box=(0, 0)) - res.paste( - resized.resize((width, fill_height), box=(0, resized.height, width, resized.height)), - box=(0, fill_height + src_h), - ) - elif ratio > src_ratio: - fill_width = width // 2 - src_w // 2 - res.paste(resized.resize((fill_width, height), box=(0, 0, 0, height)), box=(0, 0)) - res.paste( - resized.resize((fill_width, height), box=(resized.width, 0, resized.width, height)), - box=(fill_width + src_w, 0), - ) - - return res - - -def get_civitai_download_url(display_url, url_prefix="https://civitai.com"): - if "api/download" in display_url: - return display_url - import bs4 - import requests - - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE" - } - r = requests.get(display_url, headers=headers) - soup = bs4.BeautifulSoup(r.text, "lxml") - download_url = None - for a in soup.find_all("a", href=True): - if "Download" in str(a): - download_url = url_prefix + a["href"].split("?")[0] - break - return download_url - - -def http_file_name( - url: str, - *, - proxies=None, - headers: Optional[Dict[str, str]] = None, - timeout=10.0, - max_retries=0, -): - """ - Get a remote file name. 
- """ - headers = copy.deepcopy(headers) or {} - r = _request_wrapper( - method="GET", - url=url, - stream=True, - proxies=proxies, - headers=headers, - timeout=timeout, - max_retries=max_retries, - ) - hf_raise_for_status(r) - displayed_name = url.split("/")[-1] - content_disposition = r.headers.get("Content-Disposition") - if content_disposition is not None and "filename=" in content_disposition: - # Means file is on CDN - displayed_name = content_disposition.split("filename=")[-1] - return displayed_name - - -@paddle.no_grad() -def load_lora( - pipeline, - state_dict: dict, - LORA_PREFIX_UNET: str = "lora_unet", - LORA_PREFIX_TEXT_ENCODER: str = "lora_te", - ratio: float = 1.0, -): - ratio = float(ratio) - visited = [] - for key in state_dict: - if ".alpha" in key or ".lora_up" in key or key in visited: - continue - - if "text" in key: - tmp_layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_") - hf_to_ppnlp = { - "encoder": "transformer", - "fc1": "linear1", - "fc2": "linear2", - } - layer_infos = [] - for layer_info in tmp_layer_infos: - if layer_info == "mlp": - continue - layer_infos.append(hf_to_ppnlp.get(layer_info, layer_info)) - curr_layer: paddle.nn.Linear = pipeline.text_encoder - else: - layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_") - curr_layer: paddle.nn.Linear = pipeline.unet - - temp_name = layer_infos.pop(0) - while len(layer_infos) > -1: - try: - if temp_name == "to": - raise ValueError() - curr_layer = curr_layer.__getattr__(temp_name) - if len(layer_infos) > 0: - temp_name = layer_infos.pop(0) - elif len(layer_infos) == 0: - break - except Exception: - if len(temp_name) > 0: - temp_name += "_" + layer_infos.pop(0) - else: - temp_name = layer_infos.pop(0) - - triplet_keys = [key, key.replace("lora_down", "lora_up"), key.replace("lora_down.weight", "alpha")] - dtype: paddle.dtype = curr_layer.weight.dtype - weight_down: paddle.Tensor = state_dict[triplet_keys[0]].cast(dtype) - weight_up: paddle.Tensor = state_dict[triplet_keys[1]].cast(dtype) - rank: float = float(weight_down.shape[0]) - if triplet_keys[2] in state_dict: - alpha: float = state_dict[triplet_keys[2]].cast(dtype).item() - scale: float = alpha / rank - else: - scale = 1.0 - - if not hasattr(curr_layer, "backup_weights"): - curr_layer.backup_weights = curr_layer.weight.clone() - - if len(weight_down.shape) == 4: - if weight_down.shape[2:4] == [1, 1]: - # conv2d 1x1 - curr_layer.weight.copy_( - curr_layer.weight - + ratio - * paddle.matmul(weight_up.squeeze([-1, -2]), weight_down.squeeze([-1, -2])).unsqueeze([-1, -2]) - * scale, - True, - ) - else: - # conv2d 3x3 - curr_layer.weight.copy_( - curr_layer.weight - + ratio - * paddle.nn.functional.conv2d(weight_down.transpose([1, 0, 2, 3]), weight_up).transpose( - [1, 0, 2, 3] - ) - * scale, - True, - ) - else: - # linear - curr_layer.weight.copy_(curr_layer.weight + ratio * paddle.matmul(weight_up, weight_down).T * scale, True) - - # update visited list - visited.extend(triplet_keys) - return pipeline - - -class WebUIStableDiffusionPipeline(DiffusionPipeline): - r""" - Pipeline for text-to-image generation using Stable Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. 
- text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - controlnet ([`ControlNetModel`]): - Provides additional conditioning to the unet during the denoising process. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`PNDMScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] - or [`DPMSolverMultistepScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPFeatureExtractor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - _optional_components = ["safety_checker", "feature_extractor", "controlnet"] - enable_emphasis = True - comma_padding_backtrack = 20 - LORA_DIR = os.path.join(PPDIFFUSERS_CACHE, "lora") - TI_DIR = os.path.join(PPDIFFUSERS_CACHE, "textual_inversion") - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - controlnet: ControlNetModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - requires_safety_checker: bool = True, - ): - super().__init__() - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
- ) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - controlnet=controlnet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - # custom data - clip_model = FrozenCLIPEmbedder(text_encoder, tokenizer) - self.sj = StableDiffusionModelHijack(clip_model) - self.orginal_scheduler_config = self.scheduler.config - self.supported_scheduler = [ - "pndm", - "lms", - "euler", - "euler-ancestral", - "dpm-multi", - "dpm-single", - "unipc-multi", - "ddim", - "ddpm", - "deis-multi", - "heun", - "kdpm2-ancestral", - "kdpm2", - ] - self.weights_has_changed = False - - # register_state_dict_hook to fix text_encoder, when we save_pretrained text model. - def map_to(state_dict, *args, **kwargs): - if "text_model.token_embedding.wrapped.weight" in state_dict: - state_dict["text_model.token_embedding.weight"] = state_dict.pop( - "text_model.token_embedding.wrapped.weight" - ) - return state_dict - - self.text_encoder.register_state_dict_hook(map_to) - - def add_ti_embedding_dir(self, embeddings_dir=None): - self.sj.embedding_db.add_embedding_dir(embeddings_dir) - self.sj.embedding_db.load_textual_inversion_embeddings() - - def clear_ti_embedding(self): - self.sj.embedding_db.clear_embedding_dirs() - self.sj.embedding_db.load_textual_inversion_embeddings(True) - - def download_civitai_lora_file(self, url): - if os.path.isfile(url): - dst = os.path.join(self.LORA_DIR, os.path.basename(url)) - shutil.copyfile(url, dst) - return dst - - download_url = get_civitai_download_url(url) or url - file_path = ppdiffusers_url_download( - download_url, cache_dir=self.LORA_DIR, filename=http_file_name(download_url).strip('"') - ) - return file_path - - def download_civitai_ti_file(self, url): - if os.path.isfile(url): - dst = os.path.join(self.TI_DIR, os.path.basename(url)) - shutil.copyfile(url, dst) - return dst - - download_url = get_civitai_download_url(url) or url - file_path = ppdiffusers_url_download( - download_url, cache_dir=self.TI_DIR, filename=http_file_name(download_url).strip('"') - ) - return file_path - - def change_scheduler(self, scheduler_type="ddim"): - self.switch_scheduler(scheduler_type) - - def switch_scheduler(self, scheduler_type="ddim"): - scheduler_type = scheduler_type.lower() - from ppdiffusers import ( - DDIMScheduler, - DDPMScheduler, - DEISMultistepScheduler, - DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - HeunDiscreteScheduler, - KDPM2AncestralDiscreteScheduler, - KDPM2DiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - UniPCMultistepScheduler, - ) - - if scheduler_type == "pndm": - scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) - elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "dpm-multi": - scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) 
- elif scheduler_type == "dpm-single": - scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "kdpm2-ancestral": - scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "kdpm2": - scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "unipc-multi": - scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "ddim": - scheduler = DDIMScheduler.from_config( - self.orginal_scheduler_config, - steps_offset=1, - clip_sample=False, - set_alpha_to_one=False, - ) - elif scheduler_type == "ddpm": - scheduler = DDPMScheduler.from_config( - self.orginal_scheduler_config, - ) - elif scheduler_type == "deis-multi": - scheduler = DEISMultistepScheduler.from_config( - self.orginal_scheduler_config, - ) - else: - raise ValueError( - f"Scheduler of type {scheduler_type} doesn't exist! Please choose in {self.supported_scheduler}!" - ) - self.scheduler = scheduler - - @paddle.no_grad() - def _encode_prompt( - self, - prompt: str, - do_classifier_free_guidance: float = 7.5, - negative_prompt: str = None, - num_inference_steps: int = 50, - ): - if do_classifier_free_guidance: - assert isinstance(negative_prompt, str) - negative_prompt = [negative_prompt] - uc = get_learned_conditioning(self.sj.clip, negative_prompt, num_inference_steps) - else: - uc = None - - c = get_multicond_learned_conditioning(self.sj.clip, prompt, num_inference_steps) - return c, uc - - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - image, - height, - width, - callback_steps, - negative_prompt=None, - controlnet_conditioning_scale=1.0, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and not isinstance(prompt, str): - raise ValueError(f"`prompt` has to be of type `str` but is {type(prompt)}") - - if negative_prompt is not None and not isinstance(negative_prompt, str): - raise ValueError(f"`negative_prompt` has to be of type `str` but is {type(negative_prompt)}") - - # Check `image` - if image is not None and self.controlnet is not None: - if isinstance(self.controlnet, ControlNetModel): - self.check_image(image, prompt) - else: - assert False - - # Check `controlnet_conditioning_scale` - if isinstance(self.controlnet, ControlNetModel): - if not isinstance(controlnet_conditioning_scale, (float, list, tuple)): - raise TypeError( - "For single controlnet: `controlnet_conditioning_scale` must be type `float, list(float) or tuple(float)`." - ) - - def check_image(self, image, prompt): - image_is_pil = isinstance(image, PIL.Image.Image) - image_is_tensor = isinstance(image, paddle.Tensor) - image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) - image_is_tensor_list = isinstance(image, list) and isinstance(image[0], paddle.Tensor) - - if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list: - raise TypeError( - "image must be one of PIL image, paddle tensor, list of PIL images, or list of paddle tensors" - ) - - if image_is_pil: - image_batch_size = 1 - elif image_is_tensor: - image_batch_size = image.shape[0] - elif image_is_pil_list: - image_batch_size = len(image) - elif image_is_tensor_list: - image_batch_size = len(image) - - if prompt is not None and isinstance(prompt, str): - prompt_batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - prompt_batch_size = len(prompt) - - if image_batch_size != 1 and image_batch_size != prompt_batch_size: - raise ValueError( - f"If image batch size is not 1, image batch size must be same as prompt batch size. 
image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" - ) - - def prepare_image(self, image, width, height, dtype, resize_mode=-1): - if not isinstance(image, paddle.Tensor): - if isinstance(image, PIL.Image.Image): - image = resize_image(resize_mode=resize_mode, im=image, width=width, height=height) - image = [image] - - if isinstance(image[0], PIL.Image.Image): - image = [resize_image(resize_mode=resize_mode, im=im, width=width, height=height) for im in image] - - images = [] - for image_ in image: - image_ = image_.convert("RGB") - image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]) - image_ = np.array(image_) - image_ = image_[None, :] - images.append(image_) - - image = np.concatenate(images, axis=0) - image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = paddle.to_tensor(image) - elif isinstance(image[0], paddle.Tensor): - image = paddle.concat(image, axis=0) - - image = image.cast(dtype) - return image - - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = [batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor] - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - def _default_height_width(self, height, width, image): - while isinstance(image, list): - image = image[0] - - if height is None: - if isinstance(image, PIL.Image.Image): - height = image.height - elif isinstance(image, paddle.Tensor): - height = image.shape[3] - - height = (height // 8) * 8 # round down to nearest multiple of 8 - - if width is None: - if isinstance(image, PIL.Image.Image): - width = image.width - elif isinstance(image, paddle.Tensor): - width = image.shape[2] - - width = (width // 8) * 8 # round down to nearest multiple of 8 - - return height, width - - @paddle.no_grad() - def __call__( - self, - prompt: str = None, - image: PIL.Image.Image = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: str = None, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - clip_skip: int = 1, - controlnet_conditioning_scale: Union[float, List[float]] = 1.0, - enable_lora: bool = True, - resize_mode: int = 0, - # ["Just resize", "Crop and resize", "Resize and fill", "Do nothing"] - # 0 1 2 -1 - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`paddle.Tensor`, `PIL.Image.Image`): - The ControlNet input condition. 
ControlNet uses this input condition to generate guidance for the UNet. If - the type is specified as `paddle.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can - also be accepted as an image. The dimensions of the output image default to `image`'s dimensions. If - height and/or width are passed, `image` is resized according to them. If multiple ControlNets are - specified in init, images must be passed as a list such that each element of the list can be correctly - batched for input to a single controlnet. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. A higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will be generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step.
- cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/ppdiffusers/models/cross_attention.py). - clip_skip (`int`, *optional*, defaults to 1): - CLIP_stop_at_last_layers, if clip_skip <= 1, we will use the last_hidden_state from text_encoder. - controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): - The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added - to the residual in the original unet. If multiple ControlNets are specified in init, you can set the - corresponding scale as a list. - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - self.add_ti_embedding_dir(self.TI_DIR) - enable_control = image is not None and self.controlnet is not None - try: - # 0. Default height and width to unet - if enable_control: - height, width = self._default_height_width(height, width, image) - image = self.prepare_image( - image=image, - width=width, - height=height, - dtype=self.controlnet.dtype, - resize_mode=resize_mode, - ) - else: - height = height or max(self.unet.config.sample_size * self.vae_scale_factor, 512) - width = width or max(self.unet.config.sample_size * self.vae_scale_factor, 512) - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - image, - height, - width, - callback_steps, - negative_prompt, - controlnet_conditioning_scale, - ) - - batch_size = 1 - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - prompts, extra_network_data = parse_prompts([prompt]) - - if enable_lora and self.LORA_DIR is not None: - if os.path.exists(self.LORA_DIR): - lora_mapping = {p.stem: p.absolute() for p in Path(self.LORA_DIR).glob("*.safetensors")} - for params in extra_network_data["lora"]: - assert len(params.items) > 0 - name = params.items[0] - if name in lora_mapping: - ratio = float(params.items[1]) if len(params.items) > 1 else 1.0 - lora_state_dict = smart_load(lora_mapping[name], map_location=paddle.get_device()) - self.weights_has_changed = True - load_lora(self, state_dict=lora_state_dict, ratio=ratio) - del lora_state_dict - else: - print(f"We can't find lora weight: {name}! Please make sure that exists!") - else: - if len(extra_network_data["lora"]) > 0: - print(f"{self.LORA_DIR} not exists, so we cant load loras!") - - self.sj.clip.CLIP_stop_at_last_layers = clip_skip - # 3. Encode input prompt - prompt_embeds, negative_prompt_embeds = self._encode_prompt( - prompts, - do_classifier_free_guidance, - negative_prompt, - num_inference_steps=num_inference_steps, - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. 
Prepare latent variables - num_channels_latents = self.unet.in_channels - latents = self.prepare_latents( - batch_size, - num_channels_latents, - height, - width, - self.unet.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - step = i // self.scheduler.order - do_batch = False - conds_list, cond_tensor = reconstruct_multicond_batch(prompt_embeds, step) - try: - weight = conds_list[0][0][1] - except Exception: - weight = 1.0 - if do_classifier_free_guidance: - uncond_tensor = reconstruct_cond_batch(negative_prompt_embeds, step) - do_batch = cond_tensor.shape[1] == uncond_tensor.shape[1] - - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_batch else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - if do_batch: - encoder_hidden_states = paddle.concat([uncond_tensor, cond_tensor]) - control_kwargs = {} - if enable_control: - down_block_res_samples, mid_block_res_sample = self.controlnet( - latent_model_input, - t, - encoder_hidden_states=encoder_hidden_states, - controlnet_cond=paddle.concat([image, image]), - conditioning_scale=controlnet_conditioning_scale, - return_dict=False, - ) - control_kwargs["down_block_additional_residuals"] = down_block_res_samples - control_kwargs["mid_block_additional_residual"] = mid_block_res_sample - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - **control_kwargs, - ).sample - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + weight * guidance_scale * ( - noise_pred_text - noise_pred_uncond - ) - else: - control_kwargs = {} - if enable_control: - down_block_res_samples, mid_block_res_sample = self.controlnet( - latent_model_input, - t, - encoder_hidden_states=cond_tensor, - controlnet_cond=image, - conditioning_scale=controlnet_conditioning_scale, - return_dict=False, - ) - control_kwargs["down_block_additional_residuals"] = down_block_res_samples - control_kwargs["mid_block_additional_residual"] = mid_block_res_sample - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=cond_tensor, - cross_attention_kwargs=cross_attention_kwargs, - **control_kwargs, - ).sample - - if do_classifier_free_guidance: - control_kwargs = {} - if enable_control: - down_block_res_samples, mid_block_res_sample = self.controlnet( - latent_model_input, - t, - encoder_hidden_states=uncond_tensor, - controlnet_cond=image, - conditioning_scale=controlnet_conditioning_scale, - return_dict=False, - ) - control_kwargs["down_block_additional_residuals"] = down_block_res_samples - control_kwargs["mid_block_additional_residual"] = mid_block_res_sample - noise_pred_uncond = self.unet( - latent_model_input, - t, - encoder_hidden_states=uncond_tensor, - cross_attention_kwargs=cross_attention_kwargs, - **control_kwargs, - ).sample - noise_pred = noise_pred_uncond + weight * guidance_scale * (noise_pred - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # 
call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - image = latents - has_nsfw_concept = None - elif output_type == "pil": - # 8. Post-processing - image = self.decode_latents(latents) - - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, self.unet.dtype) - - # 10. Convert to PIL - image = self.numpy_to_pil(image) - else: - # 8. Post-processing - image = self.decode_latents(latents) - - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, self.unet.dtype) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - except Exception as e: - raise ValueError(e) - finally: - if enable_lora and self.weights_has_changed: - for sub_layer in self.text_encoder.sublayers(include_self=True): - if hasattr(sub_layer, "backup_weights"): - sub_layer.weight.copy_(sub_layer.backup_weights, True) - for sub_layer in self.unet.sublayers(include_self=True): - if hasattr(sub_layer, "backup_weights"): - sub_layer.weight.copy_(sub_layer.backup_weights, True) - self.weights_has_changed = False - - -# clip.py -import math -from collections import namedtuple - - -class PromptChunk: - """ - This object contains token ids, weight (multipliers:1.4) and textual inversion embedding info for a chunk of prompt. - If a prompt is short, it is represented by one PromptChunk, otherwise, multiple are necessary. - Each PromptChunk contains an exact amount of tokens - 77, which includes one for start and end token, - so just 75 tokens from prompt. - """ - - def __init__(self): - self.tokens = [] - self.multipliers = [] - self.fixes = [] - - -PromptChunkFix = namedtuple("PromptChunkFix", ["offset", "embedding"]) -"""An object of this type is a marker showing that textual inversion embedding's vectors have to placed at offset in the prompt -chunk. 
Those objects are found in PromptChunk.fixes, are placed into FrozenCLIPEmbedderWithCustomWordsBase.hijack.fixes, and finally -are applied by sd_hijack.EmbeddingsWithFixes's forward function.""" - - -class FrozenCLIPEmbedder(nn.Layer): - """Uses the CLIP transformer encoder for text (from huggingface)""" - - LAYERS = ["last", "pooled", "hidden"] - - def __init__(self, text_encoder, tokenizer, freeze=True, layer="last", layer_idx=None): - super().__init__() - assert layer in self.LAYERS - self.tokenizer = tokenizer - self.text_encoder = text_encoder - if freeze: - self.freeze() - self.layer = layer - self.layer_idx = layer_idx - if layer == "hidden": - assert layer_idx is not None - assert 0 <= abs(layer_idx) <= 12 - - def freeze(self): - self.text_encoder.eval() - for param in self.parameters(): - param.stop_gradient = True - - def forward(self, text): - batch_encoding = self.tokenizer( - text, - truncation=True, - max_length=self.tokenizer.model_max_length, - padding="max_length", - return_tensors="pd", - ) - tokens = batch_encoding["input_ids"] - outputs = self.text_encoder(input_ids=tokens, output_hidden_states=self.layer == "hidden", return_dict=True) - if self.layer == "last": - z = outputs.last_hidden_state - elif self.layer == "pooled": - z = outputs.pooler_output[:, None, :] - else: - z = outputs.hidden_states[self.layer_idx] - return z - - def encode(self, text): - return self(text) - - -class FrozenCLIPEmbedderWithCustomWordsBase(nn.Layer): - """A Paddle module that wraps the FrozenCLIPEmbedder module. It enhances FrozenCLIPEmbedder, making it possible to - have unlimited prompt length and assign weights to tokens in the prompt. - """ - - def __init__(self, wrapped, hijack): - super().__init__() - - self.wrapped = wrapped - """Original FrozenCLIPEmbedder module; can also be FrozenOpenCLIPEmbedder or xlmr.BertSeriesModelWithTransformation, - depending on model.""" - - self.hijack = hijack - self.chunk_length = 75 - - def empty_chunk(self): - """creates an empty PromptChunk and returns it""" - - chunk = PromptChunk() - chunk.tokens = [self.id_start] + [self.id_end] * (self.chunk_length + 1) - chunk.multipliers = [1.0] * (self.chunk_length + 2) - return chunk - - def get_target_prompt_token_count(self, token_count): - """returns the maximum number of tokens a prompt of a known length can have before it requires one more PromptChunk to be represented""" - - return math.ceil(max(token_count, 1) / self.chunk_length) * self.chunk_length - - def tokenize(self, texts): - """Converts a batch of texts into a batch of token ids""" - - raise NotImplementedError - - def encode_with_text_encoder(self, tokens): - """ - converts a batch of token ids (in python lists) into a single tensor with numeric representation of those tokens; - All python lists with tokens are assumed to have the same length, usually 77. - if input is a list with B elements and each element has T tokens, expected output shape is (B, T, C), where C depends on - the model - it can be 768 or 1024. - Among other things, this call will read self.hijack.fixes, apply it to its inputs, and clear it (setting it to None). - """ - - raise NotImplementedError - - def encode_embedding_init_text(self, init_text, nvpt): - """Converts text into a tensor with this text's tokens' embeddings. Note that those are embeddings before they are passed through - transformers. nvpt is used as a maximum length in tokens.
If text produces fewer tokens than nvpt, only that many are returned.""" - - raise NotImplementedError - - def tokenize_line(self, line): - """ - this transforms a single prompt into a list of PromptChunk objects - as many as needed to - represent the prompt. - Returns the list and the total number of tokens in the prompt. - """ - - if WebUIStableDiffusionPipeline.enable_emphasis: - parsed = parse_prompt_attention(line) - else: - parsed = [[line, 1.0]] - - tokenized = self.tokenize([text for text, _ in parsed]) - - chunks = [] - chunk = PromptChunk() - token_count = 0 - last_comma = -1 - - def next_chunk(is_last=False): - """puts current chunk into the list of results and produces the next one - empty; - if is_last is true, tokens at the end won't add to token_count""" - nonlocal token_count - nonlocal last_comma - nonlocal chunk - - if is_last: - token_count += len(chunk.tokens) - else: - token_count += self.chunk_length - - to_add = self.chunk_length - len(chunk.tokens) - if to_add > 0: - chunk.tokens += [self.id_end] * to_add - chunk.multipliers += [1.0] * to_add - - chunk.tokens = [self.id_start] + chunk.tokens + [self.id_end] - chunk.multipliers = [1.0] + chunk.multipliers + [1.0] - - last_comma = -1 - chunks.append(chunk) - chunk = PromptChunk() - - for tokens, (text, weight) in zip(tokenized, parsed): - if text == "BREAK" and weight == -1: - next_chunk() - continue - - position = 0 - while position < len(tokens): - token = tokens[position] - - if token == self.comma_token: - last_comma = len(chunk.tokens) - - # this is when we are at the end of the allotted 75 tokens for the current chunk, and the current token is not a comma. comma_padding_backtrack - # is a setting that specifies that if there is a comma nearby, the text after the comma should be moved out of this chunk and into the next. - elif ( - WebUIStableDiffusionPipeline.comma_padding_backtrack != 0 - and len(chunk.tokens) == self.chunk_length - and last_comma != -1 - and len(chunk.tokens) - last_comma <= WebUIStableDiffusionPipeline.comma_padding_backtrack - ): - break_location = last_comma + 1 - - reloc_tokens = chunk.tokens[break_location:] - reloc_mults = chunk.multipliers[break_location:] - - chunk.tokens = chunk.tokens[:break_location] - chunk.multipliers = chunk.multipliers[:break_location] - - next_chunk() - chunk.tokens = reloc_tokens - chunk.multipliers = reloc_mults - - if len(chunk.tokens) == self.chunk_length: - next_chunk() - - embedding, embedding_length_in_tokens = self.hijack.embedding_db.find_embedding_at_position( - tokens, position - ) - if embedding is None: - chunk.tokens.append(token) - chunk.multipliers.append(weight) - position += 1 - continue - - emb_len = int(embedding.vec.shape[0]) - if len(chunk.tokens) + emb_len > self.chunk_length: - next_chunk() - - chunk.fixes.append(PromptChunkFix(len(chunk.tokens), embedding)) - - chunk.tokens += [0] * emb_len - chunk.multipliers += [weight] * emb_len - position += embedding_length_in_tokens - - if len(chunk.tokens) > 0 or len(chunks) == 0: - next_chunk(is_last=True) - - return chunks, token_count - - def process_texts(self, texts): - """ - Accepts a list of texts and calls tokenize_line() on each, with cache. Returns the list of results and maximum - length, in tokens, of all texts.
- """ - - token_count = 0 - - cache = {} - batch_chunks = [] - for line in texts: - if line in cache: - chunks = cache[line] - else: - chunks, current_token_count = self.tokenize_line(line) - token_count = max(current_token_count, token_count) - - cache[line] = chunks - - batch_chunks.append(chunks) - - return batch_chunks, token_count - - def forward(self, texts): - """ - Accepts an array of texts; Passes texts through transformers network to create a tensor with numerical representation of those texts. - Returns a tensor with shape of (B, T, C), where B is length of the array; T is length, in tokens, of texts (including padding) - T will - be a multiple of 77; and C is dimensionality of each token - for SD1 it's 768, and for SD2 it's 1024. - An example shape returned by this function can be: (2, 77, 768). - Webui usually sends just one text at a time through this function - the only time when texts is an array with more than one elemenet - is when you do prompt editing: "a picture of a [cat:dog:0.4] eating ice cream" - """ - - batch_chunks, token_count = self.process_texts(texts) - - used_embeddings = {} - chunk_count = max([len(x) for x in batch_chunks]) - - zs = [] - for i in range(chunk_count): - batch_chunk = [chunks[i] if i < len(chunks) else self.empty_chunk() for chunks in batch_chunks] - - tokens = [x.tokens for x in batch_chunk] - multipliers = [x.multipliers for x in batch_chunk] - self.hijack.fixes = [x.fixes for x in batch_chunk] - - for fixes in self.hijack.fixes: - for position, embedding in fixes: - used_embeddings[embedding.name] = embedding - - z = self.process_tokens(tokens, multipliers) - zs.append(z) - - if len(used_embeddings) > 0: - embeddings_list = ", ".join( - [f"{name} [{embedding.checksum()}]" for name, embedding in used_embeddings.items()] - ) - self.hijack.comments.append(f"Used embeddings: {embeddings_list}") - - return paddle.concat(zs, axis=1) - - def process_tokens(self, remade_batch_tokens, batch_multipliers): - """ - sends one single prompt chunk to be encoded by transformers neural network. - remade_batch_tokens is a batch of tokens - a list, where every element is a list of tokens; usually - there are exactly 77 tokens in the list. batch_multipliers is the same but for multipliers instead of tokens. - Multipliers are used to give more or less weight to the outputs of transformers network. Each multiplier - corresponds to one token. - """ - tokens = paddle.to_tensor(remade_batch_tokens) - - # this is for SD2: SD1 uses the same token for padding and end of text, while SD2 uses different ones. 
- if self.id_end != self.id_pad: - for batch_pos in range(len(remade_batch_tokens)): - index = remade_batch_tokens[batch_pos].index(self.id_end) - tokens[batch_pos, index + 1 : tokens.shape[1]] = self.id_pad - - z = self.encode_with_text_encoder(tokens) - - # restoring original mean is likely not correct, but it seems to work well to prevent artifacts that happen otherwise - batch_multipliers = paddle.to_tensor(batch_multipliers) - original_mean = z.mean() - z = z * batch_multipliers.reshape( - batch_multipliers.shape - + [ - 1, - ] - ).expand(z.shape) - new_mean = z.mean() - z = z * (original_mean / new_mean) - - return z - - -class FrozenCLIPEmbedderWithCustomWords(FrozenCLIPEmbedderWithCustomWordsBase): - def __init__(self, wrapped, hijack, CLIP_stop_at_last_layers=-1): - super().__init__(wrapped, hijack) - self.CLIP_stop_at_last_layers = CLIP_stop_at_last_layers - self.tokenizer = wrapped.tokenizer - - vocab = self.tokenizer.get_vocab() - - self.comma_token = vocab.get(",", None) - - self.token_mults = {} - tokens_with_parens = [(k, v) for k, v in vocab.items() if "(" in k or ")" in k or "[" in k or "]" in k] - for text, ident in tokens_with_parens: - mult = 1.0 - for c in text: - if c == "[": - mult /= 1.1 - if c == "]": - mult *= 1.1 - if c == "(": - mult *= 1.1 - if c == ")": - mult /= 1.1 - - if mult != 1.0: - self.token_mults[ident] = mult - - self.id_start = self.wrapped.tokenizer.bos_token_id - self.id_end = self.wrapped.tokenizer.eos_token_id - self.id_pad = self.id_end - - def tokenize(self, texts): - tokenized = self.wrapped.tokenizer(texts, truncation=False, add_special_tokens=False)["input_ids"] - - return tokenized - - def encode_with_text_encoder(self, tokens): - output_hidden_states = self.CLIP_stop_at_last_layers > 1 - outputs = self.wrapped.text_encoder( - input_ids=tokens, output_hidden_states=output_hidden_states, return_dict=True - ) - - if output_hidden_states: - z = outputs.hidden_states[-self.CLIP_stop_at_last_layers] - z = self.wrapped.text_encoder.text_model.ln_final(z) - else: - z = outputs.last_hidden_state - - return z - - def encode_embedding_init_text(self, init_text, nvpt): - embedding_layer = self.wrapped.text_encoder.text_model - ids = self.wrapped.tokenizer(init_text, max_length=nvpt, return_tensors="pd", add_special_tokens=False)[ - "input_ids" - ] - embedded = embedding_layer.token_embedding.wrapped(ids).squeeze(0) - - return embedded - - -# extra_networks.py -import re -from collections import defaultdict - - -class ExtraNetworkParams: - def __init__(self, items=None): - self.items = items or [] - - -re_extra_net = re.compile(r"<(\w+):([^>]+)>") - - -def parse_prompt(prompt): - res = defaultdict(list) - - def found(m): - name = m.group(1) - args = m.group(2) - - res[name].append(ExtraNetworkParams(items=args.split(":"))) - - return "" - - prompt = re.sub(re_extra_net, found, prompt) - - return prompt, res - - -def parse_prompts(prompts): - res = [] - extra_data = None - - for prompt in prompts: - updated_prompt, parsed_extra_data = parse_prompt(prompt) - - if extra_data is None: - extra_data = parsed_extra_data - - res.append(updated_prompt) - - return res, extra_data - - -# image_embeddings.py - -import base64 -import json -import zlib - -import numpy as np -from PIL import Image - - -class EmbeddingDecoder(json.JSONDecoder): - def __init__(self, *args, **kwargs): - json.JSONDecoder.__init__(self, object_hook=self.object_hook, *args, **kwargs) - - def object_hook(self, d): - if "TORCHTENSOR" in d: - return 
paddle.to_tensor(np.array(d["TORCHTENSOR"])) - return d - - -def embedding_from_b64(data): - d = base64.b64decode(data) - return json.loads(d, cls=EmbeddingDecoder) - - -def lcg(m=2**32, a=1664525, c=1013904223, seed=0): - while True: - seed = (a * seed + c) % m - yield seed % 255 - - -def xor_block(block): - g = lcg() - randblock = np.array([next(g) for _ in range(np.product(block.shape))]).astype(np.uint8).reshape(block.shape) - return np.bitwise_xor(block.astype(np.uint8), randblock & 0x0F) - - -def crop_black(img, tol=0): - mask = (img > tol).all(2) - mask0, mask1 = mask.any(0), mask.any(1) - col_start, col_end = mask0.argmax(), mask.shape[1] - mask0[::-1].argmax() - row_start, row_end = mask1.argmax(), mask.shape[0] - mask1[::-1].argmax() - return img[row_start:row_end, col_start:col_end] - - -def extract_image_data_embed(image): - d = 3 - outarr = ( - crop_black(np.array(image.convert("RGB").getdata()).reshape(image.size[1], image.size[0], d).astype(np.uint8)) - & 0x0F - ) - black_cols = np.where(np.sum(outarr, axis=(0, 2)) == 0) - if black_cols[0].shape[0] < 2: - print("No Image data blocks found.") - return None - - data_block_lower = outarr[:, : black_cols[0].min(), :].astype(np.uint8) - data_block_upper = outarr[:, black_cols[0].max() + 1 :, :].astype(np.uint8) - - data_block_lower = xor_block(data_block_lower) - data_block_upper = xor_block(data_block_upper) - - data_block = (data_block_upper << 4) | (data_block_lower) - data_block = data_block.flatten().tobytes() - - data = zlib.decompress(data_block) - return json.loads(data, cls=EmbeddingDecoder) - - -# prompt_parser.py -import re -from collections import namedtuple -from typing import List - -import lark - -# a prompt like this: "fantasy landscape with a [mountain:lake:0.25] and [an oak:a christmas tree:0.75][ in foreground::0.6][ in background:0.25] [shoddy:masterful:0.5]" -# will be represented with prompt_schedule like this (assuming steps=100): -# [25, 'fantasy landscape with a mountain and an oak in foreground shoddy'] -# [50, 'fantasy landscape with a lake and an oak in foreground in background shoddy'] -# [60, 'fantasy landscape with a lake and an oak in foreground in background masterful'] -# [75, 'fantasy landscape with a lake and an oak in background masterful'] -# [100, 'fantasy landscape with a lake and a christmas tree in background masterful'] - -schedule_parser = lark.Lark( - r""" -!start: (prompt | /[][():]/+)* -prompt: (emphasized | scheduled | alternate | plain | WHITESPACE)* -!emphasized: "(" prompt ")" - | "(" prompt ":" prompt ")" - | "[" prompt "]" -scheduled: "[" [prompt ":"] prompt ":" [WHITESPACE] NUMBER "]" -alternate: "[" prompt ("|" prompt)+ "]" -WHITESPACE: /\s+/ -plain: /([^\\\[\]():|]|\\.)+/ -%import common.SIGNED_NUMBER -> NUMBER -""" -) - - -def get_learned_conditioning_prompt_schedules(prompts, steps): - """ - >>> g = lambda p: get_learned_conditioning_prompt_schedules([p], 10)[0] - >>> g("test") - [[10, 'test']] - >>> g("a [b:3]") - [[3, 'a '], [10, 'a b']] - >>> g("a [b: 3]") - [[3, 'a '], [10, 'a b']] - >>> g("a [[[b]]:2]") - [[2, 'a '], [10, 'a [[b]]']] - >>> g("[(a:2):3]") - [[3, ''], [10, '(a:2)']] - >>> g("a [b : c : 1] d") - [[1, 'a b d'], [10, 'a c d']] - >>> g("a[b:[c:d:2]:1]e") - [[1, 'abe'], [2, 'ace'], [10, 'ade']] - >>> g("a [unbalanced") - [[10, 'a [unbalanced']] - >>> g("a [b:.5] c") - [[5, 'a c'], [10, 'a b c']] - >>> g("a [{b|d{:.5] c") # not handling this right now - [[5, 'a c'], [10, 'a {b|d{ c']] - >>> g("((a][:b:c [d:3]") - [[3, '((a][:b:c '], [10, '((a][:b:c d']] - >>> 
g("[a|(b:1.1)]") - [[1, 'a'], [2, '(b:1.1)'], [3, 'a'], [4, '(b:1.1)'], [5, 'a'], [6, '(b:1.1)'], [7, 'a'], [8, '(b:1.1)'], [9, 'a'], [10, '(b:1.1)']] - """ - - def collect_steps(steps, tree): - l = [steps] - - class CollectSteps(lark.Visitor): - def scheduled(self, tree): - tree.children[-1] = float(tree.children[-1]) - if tree.children[-1] < 1: - tree.children[-1] *= steps - tree.children[-1] = min(steps, int(tree.children[-1])) - l.append(tree.children[-1]) - - def alternate(self, tree): - l.extend(range(1, steps + 1)) - - CollectSteps().visit(tree) - return sorted(set(l)) - - def at_step(step, tree): - class AtStep(lark.Transformer): - def scheduled(self, args): - before, after, _, when = args - yield before or () if step <= when else after - - def alternate(self, args): - yield next(args[(step - 1) % len(args)]) - - def start(self, args): - def flatten(x): - if type(x) == str: - yield x - else: - for gen in x: - yield from flatten(gen) - - return "".join(flatten(args)) - - def plain(self, args): - yield args[0].value - - def __default__(self, data, children, meta): - for child in children: - yield child - - return AtStep().transform(tree) - - def get_schedule(prompt): - try: - tree = schedule_parser.parse(prompt) - except lark.exceptions.LarkError: - if 0: - import traceback - - traceback.print_exc() - return [[steps, prompt]] - return [[t, at_step(t, tree)] for t in collect_steps(steps, tree)] - - promptdict = {prompt: get_schedule(prompt) for prompt in set(prompts)} - return [promptdict[prompt] for prompt in prompts] - - -ScheduledPromptConditioning = namedtuple("ScheduledPromptConditioning", ["end_at_step", "cond"]) - - -def get_learned_conditioning(model, prompts, steps): - """converts a list of prompts into a list of prompt schedules - each schedule is a list of ScheduledPromptConditioning, specifying the comdition (cond), - and the sampling step at which this condition is to be replaced by the next one. 
- - Input: - (model, ['a red crown', 'a [blue:green:5] jeweled crown'], 20) - - Output: - [ - [ - ScheduledPromptConditioning(end_at_step=20, cond=tensor([[-0.3886, 0.0229, -0.0523, ..., -0.4901, -0.3066, 0.0674], ..., [ 0.3317, -0.5102, -0.4066, ..., 0.4119, -0.7647, -1.0160]], device='cuda:0')) - ], - [ - ScheduledPromptConditioning(end_at_step=5, cond=tensor([[-0.3886, 0.0229, -0.0522, ..., -0.4901, -0.3067, 0.0673], ..., [-0.0192, 0.3867, -0.4644, ..., 0.1135, -0.3696, -0.4625]], device='cuda:0')), - ScheduledPromptConditioning(end_at_step=20, cond=tensor([[-0.3886, 0.0229, -0.0522, ..., -0.4901, -0.3067, 0.0673], ..., [-0.7352, -0.4356, -0.7888, ..., 0.6994, -0.4312, -1.2593]], device='cuda:0')) - ] - ] - """ - res = [] - - prompt_schedules = get_learned_conditioning_prompt_schedules(prompts, steps) - cache = {} - - for prompt, prompt_schedule in zip(prompts, prompt_schedules): - - cached = cache.get(prompt, None) - if cached is not None: - res.append(cached) - continue - - texts = [x[1] for x in prompt_schedule] - conds = model(texts) - - cond_schedule = [] - for i, (end_at_step, text) in enumerate(prompt_schedule): - cond_schedule.append(ScheduledPromptConditioning(end_at_step, conds[i])) - - cache[prompt] = cond_schedule - res.append(cond_schedule) - - return res - - -re_AND = re.compile(r"\bAND\b") -re_weight = re.compile(r"^(.*?)(?:\s*:\s*([-+]?(?:\d+\.?|\d*\.\d+)))?\s*$") - - -def get_multicond_prompt_list(prompts): - res_indexes = [] - - prompt_flat_list = [] - prompt_indexes = {} - - for prompt in prompts: - subprompts = re_AND.split(prompt) - - indexes = [] - for subprompt in subprompts: - match = re_weight.search(subprompt) - - text, weight = match.groups() if match is not None else (subprompt, 1.0) - - weight = float(weight) if weight is not None else 1.0 - - index = prompt_indexes.get(text, None) - if index is None: - index = len(prompt_flat_list) - prompt_flat_list.append(text) - prompt_indexes[text] = index - - indexes.append((index, weight)) - - res_indexes.append(indexes) - - return res_indexes, prompt_flat_list, prompt_indexes - - -class ComposableScheduledPromptConditioning: - def __init__(self, schedules, weight=1.0): - self.schedules: List[ScheduledPromptConditioning] = schedules - self.weight: float = weight - - -class MulticondLearnedConditioning: - def __init__(self, shape, batch): - self.shape: tuple = shape # the shape field is needed to send this object to DDIM/PLMS - self.batch: List[List[ComposableScheduledPromptConditioning]] = batch - - -def get_multicond_learned_conditioning(model, prompts, steps) -> MulticondLearnedConditioning: - """same as get_learned_conditioning, but returns a list of ScheduledPromptConditioning along with the weight objects for each prompt. - For each prompt, the list is obtained by splitting the prompt using the AND separator. 
- - https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/ - """ - - res_indexes, prompt_flat_list, prompt_indexes = get_multicond_prompt_list(prompts) - - learned_conditioning = get_learned_conditioning(model, prompt_flat_list, steps) - - res = [] - for indexes in res_indexes: - res.append([ComposableScheduledPromptConditioning(learned_conditioning[i], weight) for i, weight in indexes]) - - return MulticondLearnedConditioning(shape=(len(prompts),), batch=res) - - -def reconstruct_cond_batch(c: List[List[ScheduledPromptConditioning]], current_step): - param = c[0][0].cond - res = paddle.zeros( - [ - len(c), - ] - + param.shape, - dtype=param.dtype, - ) - for i, cond_schedule in enumerate(c): - target_index = 0 - for current, (end_at, cond) in enumerate(cond_schedule): - if current_step <= end_at: - target_index = current - break - res[i] = cond_schedule[target_index].cond - - return res - - -def reconstruct_multicond_batch(c: MulticondLearnedConditioning, current_step): - param = c.batch[0][0].schedules[0].cond - - tensors = [] - conds_list = [] - - for batch_no, composable_prompts in enumerate(c.batch): - conds_for_batch = [] - - for cond_index, composable_prompt in enumerate(composable_prompts): - target_index = 0 - for current, (end_at, cond) in enumerate(composable_prompt.schedules): - if current_step <= end_at: - target_index = current - break - - conds_for_batch.append((len(tensors), composable_prompt.weight)) - tensors.append(composable_prompt.schedules[target_index].cond) - - conds_list.append(conds_for_batch) - - # if prompts have wildly different lengths above the limit we'll get tensors fo different shapes - # and won't be able to torch.stack them. So this fixes that. - token_count = max([x.shape[0] for x in tensors]) - for i in range(len(tensors)): - if tensors[i].shape[0] != token_count: - last_vector = tensors[i][-1:] - last_vector_repeated = last_vector.tile([token_count - tensors[i].shape[0], 1]) - tensors[i] = paddle.concat([tensors[i], last_vector_repeated], axis=0) - - return conds_list, paddle.stack(tensors).cast(dtype=param.dtype) - - -re_attention = re.compile( - r""" -\\\(| -\\\)| -\\\[| -\\]| -\\\\| -\\| -\(| -\[| -:([+-]?[.\d]+)\)| -\)| -]| -[^\\()\[\]:]+| -: -""", - re.X, -) - -re_break = re.compile(r"\s*\bBREAK\b\s*", re.S) - - -def parse_prompt_attention(text): - """ - Parses a string with attention tokens and returns a list of pairs: text and its associated weight. 
- Accepted tokens are: - (abc) - increases attention to abc by a multiplier of 1.1 - (abc:3.12) - increases attention to abc by a multiplier of 3.12 - [abc] - decreases attention to abc by a multiplier of 1.1 - \( - literal character '(' - \[ - literal character '[' - \) - literal character ')' - \] - literal character ']' - \\ - literal character '\' - anything else - just text - - >>> parse_prompt_attention('normal text') - [['normal text', 1.0]] - >>> parse_prompt_attention('an (important) word') - [['an ', 1.0], ['important', 1.1], [' word', 1.0]] - >>> parse_prompt_attention('(unbalanced') - [['unbalanced', 1.1]] - >>> parse_prompt_attention('\(literal\]') - [['(literal]', 1.0]] - >>> parse_prompt_attention('(unnecessary)(parens)') - [['unnecessaryparens', 1.1]] - >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).') - [['a ', 1.0], - ['house', 1.5730000000000004], - [' ', 1.1], - ['on', 1.0], - [' a ', 1.1], - ['hill', 0.55], - [', sun, ', 1.1], - ['sky', 1.4641000000000006], - ['.', 1.1]] - """ - - res = [] - round_brackets = [] - square_brackets = [] - - round_bracket_multiplier = 1.1 - square_bracket_multiplier = 1 / 1.1 - - def multiply_range(start_position, multiplier): - for p in range(start_position, len(res)): - res[p][1] *= multiplier - - for m in re_attention.finditer(text): - text = m.group(0) - weight = m.group(1) - - if text.startswith("\\"): - res.append([text[1:], 1.0]) - elif text == "(": - round_brackets.append(len(res)) - elif text == "[": - square_brackets.append(len(res)) - elif weight is not None and len(round_brackets) > 0: - multiply_range(round_brackets.pop(), float(weight)) - elif text == ")" and len(round_brackets) > 0: - multiply_range(round_brackets.pop(), round_bracket_multiplier) - elif text == "]" and len(square_brackets) > 0: - multiply_range(square_brackets.pop(), square_bracket_multiplier) - else: - parts = re.split(re_break, text) - for i, part in enumerate(parts): - if i > 0: - res.append(["BREAK", -1]) - res.append([part, 1.0]) - - for pos in round_brackets: - multiply_range(pos, round_bracket_multiplier) - - for pos in square_brackets: - multiply_range(pos, square_bracket_multiplier) - - if len(res) == 0: - res = [["", 1.0]] - - # merge runs of identical weights - i = 0 - while i + 1 < len(res): - if res[i][1] == res[i + 1][1]: - res[i][0] += res[i + 1][0] - res.pop(i + 1) - else: - i += 1 - - return res - - -# sd_hijack.py - - -class StableDiffusionModelHijack: - fixes = None - comments = [] - layers = None - circular_enabled = False - - def __init__(self, clip_model, embeddings_dir=None, CLIP_stop_at_last_layers=-1): - model_embeddings = clip_model.text_encoder.text_model - model_embeddings.token_embedding = EmbeddingsWithFixes(model_embeddings.token_embedding, self) - clip_model = FrozenCLIPEmbedderWithCustomWords( - clip_model, self, CLIP_stop_at_last_layers=CLIP_stop_at_last_layers - ) - - self.embedding_db = EmbeddingDatabase(clip_model) - self.embedding_db.add_embedding_dir(embeddings_dir) - - # hack this! 
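        # self.clip is the wrapped encoder (FrozenCLIPEmbedderWithCustomWords): prompt weighting,
        # BREAK handling and textual-inversion lookups are routed through this wrapper rather than
        # through the original CLIP text model.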
- self.clip = clip_model - - def flatten(el): - flattened = [flatten(children) for children in el.children()] - res = [el] - for c in flattened: - res += c - return res - - self.layers = flatten(clip_model) - - def clear_comments(self): - self.comments = [] - - def get_prompt_lengths(self, text): - _, token_count = self.clip.process_texts([text]) - - return token_count, self.clip.get_target_prompt_token_count(token_count) - - -class EmbeddingsWithFixes(nn.Layer): - def __init__(self, wrapped, embeddings): - super().__init__() - self.wrapped = wrapped - self.embeddings = embeddings - - def forward(self, input_ids): - batch_fixes = self.embeddings.fixes - self.embeddings.fixes = None - - inputs_embeds = self.wrapped(input_ids) - - if batch_fixes is None or len(batch_fixes) == 0 or max([len(x) for x in batch_fixes]) == 0: - return inputs_embeds - - vecs = [] - for fixes, tensor in zip(batch_fixes, inputs_embeds): - for offset, embedding in fixes: - emb = embedding.vec.cast(self.wrapped.dtype) - emb_len = min(tensor.shape[0] - offset - 1, emb.shape[0]) - tensor = paddle.concat([tensor[0 : offset + 1], emb[0:emb_len], tensor[offset + 1 + emb_len :]]) - - vecs.append(tensor) - - return paddle.stack(vecs) - - -# textual_inversion.py - -import os -import sys -import traceback - - -class Embedding: - def __init__(self, vec, name, step=None): - self.vec = vec - self.name = name - self.step = step - self.shape = None - self.vectors = 0 - self.cached_checksum = None - self.sd_checkpoint = None - self.sd_checkpoint_name = None - self.optimizer_state_dict = None - self.filename = None - - def save(self, filename): - embedding_data = { - "string_to_token": {"*": 265}, - "string_to_param": {"*": self.vec}, - "name": self.name, - "step": self.step, - "sd_checkpoint": self.sd_checkpoint, - "sd_checkpoint_name": self.sd_checkpoint_name, - } - - paddle.save(embedding_data, filename) - - def checksum(self): - if self.cached_checksum is not None: - return self.cached_checksum - - def const_hash(a): - r = 0 - for v in a: - r = (r * 281 ^ int(v) * 997) & 0xFFFFFFFF - return r - - self.cached_checksum = f"{const_hash(self.vec.flatten() * 100) & 0xffff:04x}" - return self.cached_checksum - - -class DirWithTextualInversionEmbeddings: - def __init__(self, path): - self.path = path - self.mtime = None - - def has_changed(self): - if not os.path.isdir(self.path): - return False - - mt = os.path.getmtime(self.path) - if self.mtime is None or mt > self.mtime: - return True - - def update(self): - if not os.path.isdir(self.path): - return - - self.mtime = os.path.getmtime(self.path) - - -class EmbeddingDatabase: - def __init__(self, clip): - self.clip = clip - self.ids_lookup = {} - self.word_embeddings = {} - self.skipped_embeddings = {} - self.expected_shape = -1 - self.embedding_dirs = {} - self.previously_displayed_embeddings = () - - def add_embedding_dir(self, path): - if path is not None and path not in self.embedding_dirs: - self.embedding_dirs[path] = DirWithTextualInversionEmbeddings(path) - - def clear_embedding_dirs(self): - self.embedding_dirs.clear() - - def register_embedding(self, embedding, model): - self.word_embeddings[embedding.name] = embedding - - ids = model.tokenize([embedding.name])[0] - - first_id = ids[0] - if first_id not in self.ids_lookup: - self.ids_lookup[first_id] = [] - - self.ids_lookup[first_id] = sorted( - self.ids_lookup[first_id] + [(ids, embedding)], key=lambda x: len(x[0]), reverse=True - ) - - return embedding - - def get_expected_shape(self): - vec = 
self.clip.encode_embedding_init_text(",", 1) - return vec.shape[1] - - def load_from_file(self, path, filename): - name, ext = os.path.splitext(filename) - ext = ext.upper() - - if ext in [".PNG", ".WEBP", ".JXL", ".AVIF"]: - _, second_ext = os.path.splitext(name) - if second_ext.upper() == ".PREVIEW": - return - - embed_image = Image.open(path) - if hasattr(embed_image, "text") and "sd-ti-embedding" in embed_image.text: - data = embedding_from_b64(embed_image.text["sd-ti-embedding"]) - name = data.get("name", name) - else: - data = extract_image_data_embed(embed_image) - if data: - name = data.get("name", name) - else: - # if data is None, means this is not an embeding, just a preview image - return - elif ext in [".BIN", ".PT"]: - data = torch_load(path) - elif ext in [".SAFETENSORS"]: - data = safetensors_load(path) - else: - return - - # textual inversion embeddings - if "string_to_param" in data: - param_dict = data["string_to_param"] - if hasattr(param_dict, "_parameters"): - param_dict = getattr(param_dict, "_parameters") - assert len(param_dict) == 1, "embedding file has multiple terms in it" - emb = next(iter(param_dict.items()))[1] - # diffuser concepts - elif type(data) == dict and type(next(iter(data.values()))) == paddle.Tensor: - assert len(data.keys()) == 1, "embedding file has multiple terms in it" - - emb = next(iter(data.values())) - if len(emb.shape) == 1: - emb = emb.unsqueeze(0) - else: - raise Exception( - f"Couldn't identify {filename} as neither textual inversion embedding nor diffuser concept." - ) - - with paddle.no_grad(): - if hasattr(emb, "detach"): - emb = emb.detach() - if hasattr(emb, "cpu"): - emb = emb.cpu() - if hasattr(emb, "numpy"): - emb = emb.numpy() - emb = paddle.to_tensor(emb) - vec = emb.detach().cast(paddle.float32) - embedding = Embedding(vec, name) - embedding.step = data.get("step", None) - embedding.sd_checkpoint = data.get("sd_checkpoint", None) - embedding.sd_checkpoint_name = data.get("sd_checkpoint_name", None) - embedding.vectors = vec.shape[0] - embedding.shape = vec.shape[-1] - embedding.filename = path - - if self.expected_shape == -1 or self.expected_shape == embedding.shape: - self.register_embedding(embedding, self.clip) - else: - self.skipped_embeddings[name] = embedding - - def load_from_dir(self, embdir): - if not os.path.isdir(embdir.path): - return - - for root, dirs, fns in os.walk(embdir.path, followlinks=True): - for fn in fns: - try: - fullfn = os.path.join(root, fn) - - if os.stat(fullfn).st_size == 0: - continue - - self.load_from_file(fullfn, fn) - except Exception: - print(f"Error loading embedding {fn}:", file=sys.stderr) - print(traceback.format_exc(), file=sys.stderr) - continue - - def load_textual_inversion_embeddings(self, force_reload=False): - if not force_reload: - need_reload = False - for path, embdir in self.embedding_dirs.items(): - if embdir.has_changed(): - need_reload = True - break - - if not need_reload: - return - - self.ids_lookup.clear() - self.word_embeddings.clear() - self.skipped_embeddings.clear() - self.expected_shape = self.get_expected_shape() - - for path, embdir in self.embedding_dirs.items(): - self.load_from_dir(embdir) - embdir.update() - - displayed_embeddings = (tuple(self.word_embeddings.keys()), tuple(self.skipped_embeddings.keys())) - if self.previously_displayed_embeddings != displayed_embeddings: - self.previously_displayed_embeddings = displayed_embeddings - print( - f"Textual inversion embeddings loaded({len(self.word_embeddings)}): {', '.join(self.word_embeddings.keys())}" - ) 
- if len(self.skipped_embeddings) > 0: - print( - f"Textual inversion embeddings skipped({len(self.skipped_embeddings)}): {', '.join(self.skipped_embeddings.keys())}" - ) - - def find_embedding_at_position(self, tokens, offset): - token = tokens[offset] - possible_matches = self.ids_lookup.get(token, None) - - if possible_matches is None: - return None, None - - for ids, embedding in possible_matches: - if tokens[offset : offset + len(ids)] == ids: - return embedding, len(ids) - - return None, None diff --git a/ppdiffusers/examples/community/wildcard_stable_diffusion.py b/ppdiffusers/examples/community/wildcard_stable_diffusion.py deleted file mode 100644 index dc527e129221..000000000000 --- a/ppdiffusers/examples/community/wildcard_stable_diffusion.py +++ /dev/null @@ -1,423 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -import os -import random -import re -from dataclasses import dataclass -from typing import Callable, Dict, List, Optional, Union - -import paddle - -from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer -from ppdiffusers.configuration_utils import FrozenDict -from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel -from ppdiffusers.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import ( - StableDiffusionPipelineOutput, -) -from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( - StableDiffusionSafetyChecker, -) -from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler -from ppdiffusers.utils import deprecate, logging - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -global_re_wildcard = re.compile(r"__([^_]*)__") - - -def get_filename(path: str): - # this doesn't work on Windows - return os.path.basename(path).split(".txt")[0] - - -def read_wildcard_values(path: str): - with open(path, encoding="utf8") as f: - return f.read().splitlines() - - -def grab_wildcard_values(wildcard_option_dict: Dict[str, List[str]] = {}, wildcard_files: List[str] = []): - for wildcard_file in wildcard_files: - filename = get_filename(wildcard_file) - read_values = read_wildcard_values(wildcard_file) - if filename not in wildcard_option_dict: - wildcard_option_dict[filename] = [] - wildcard_option_dict[filename].extend(read_values) - return wildcard_option_dict - - -def replace_prompt_with_wildcards( - prompt: str, wildcard_option_dict: Dict[str, List[str]] = {}, wildcard_files: List[str] = [] -): - new_prompt = prompt - - # get wildcard options - wildcard_option_dict = grab_wildcard_values(wildcard_option_dict, wildcard_files) - - for m in global_re_wildcard.finditer(new_prompt): - wildcard_value = m.group() - replace_value = random.choice(wildcard_option_dict[wildcard_value.strip("__")]) - new_prompt = new_prompt.replace(wildcard_value, replace_value, 1) - - return 
new_prompt - - -@dataclass -class WildcardStableDiffusionOutput(StableDiffusionPipelineOutput): - prompts: List[str] - - -class WildcardStableDiffusionPipeline(DiffusionPipeline): - r""" - Example Usage: - pipe = WildcardStableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4" - ) - prompt = "__animal__ sitting on a __object__ wearing a __clothing__" - out = pipe( - prompt, - wildcard_option_dict={ - "clothing":["hat", "shirt", "scarf", "beret"] - }, - wildcard_files=["object.txt", "animal.txt"], - num_prompt_samples=1 - ) - Pipeline for text-to-image generation with wild cards using Stable Diffusion. - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPFeatureExtractor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - ): - super().__init__() - - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " - "to update the config accordingly as leaving `steps_offset` might led to incorrect results" - " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," - " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file" - ) - deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if safety_checker is None: - logger.warn( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. 
Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - - @paddle.no_grad() - def __call__( - self, - prompt: Union[str, List[str]], - height: int = 512, - width: int = 512, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - seed: Optional[int] = None, - latents: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - wildcard_option_dict: Dict[str, List[str]] = {}, - wildcard_files: List[str] = [], - num_prompt_samples: Optional[int] = 1, - **kwargs, - ): - r""" - Function invoked when calling the pipeline for generation. - Args: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - height (`int`, *optional*, defaults to 512): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to 512): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - seed (`int`, *optional*): - A random seed. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
- return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - wildcard_option_dict (Dict[str, List[str]]): - dict with key as `wildcard` and values as a list of possible replacements. For example if a prompt, "A __animal__ sitting on a chair". A wildcard_option_dict can provide possible values for "animal" like this: {"animal":["dog", "cat", "fox"]} - wildcard_files: (List[str]) - List of filenames of txt files for wildcard replacements. For example if a prompt, "A __animal__ sitting on a chair". A file can be provided ["animal.txt"] - num_prompt_samples: int - Number of times to sample wildcards for each prompt provided - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - - if isinstance(prompt, str): - prompt = [ - replace_prompt_with_wildcards(prompt, wildcard_option_dict, wildcard_files) - for i in range(num_prompt_samples) - ] - batch_size = len(prompt) - elif isinstance(prompt, list): - prompt_list = [] - for p in prompt: - for i in range(num_prompt_samples): - prompt_list.append(replace_prompt_with_wildcards(p, wildcard_option_dict, wildcard_files)) - prompt = prompt_list - batch_size = len(prompt) - else: - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." 
- ) - - # get prompt text embeddings - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - - if text_input_ids.shape[-1] > self.tokenizer.model_max_length: - removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] - attention_mask = paddle.ones_like(text_input_ids) - text_embeddings = self.text_encoder(text_input_ids, attention_mask=attention_mask)[0] - - # duplicate text embeddings for each generation per prompt, using mps friendly method - bs_embed, seq_len, _ = text_embeddings.shape - text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1]) - text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - max_length = text_input_ids.shape[-1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - attention_mask = paddle.ones_like(uncond_input.input_ids) - uncond_embeddings = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask)[0] - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = uncond_embeddings.shape[1] - uncond_embeddings = uncond_embeddings.tile([batch_size, num_images_per_prompt, 1]) - uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) - - # get the initial random noise unless the user supplied it - - # Unlike in other pipelines, latents need to be generated in the target device - # for 1-to-1 results reproducibility with the CompVis implementation. - # However this currently doesn't work in `mps`. 
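        # Latent layout (descriptive): (batch_size * num_images_per_prompt, unet.in_channels,
        # height // 8, width // 8) - for Stable Diffusion 1.x that is 4 channels at 1/8 of the
        # pixel resolution, which is why height and width must be divisible by 8.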
- latents_shape = [batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8] - latents_dtype = text_embeddings.dtype - if latents is None: - if seed is not None: - paddle.seed(seed) - latents = paddle.randn(latents_shape, dtype=latents_dtype) - else: - if latents.shape != latents_shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") - latents = latents - - # set timesteps - self.scheduler.set_timesteps(num_inference_steps) - - # Some schedulers like PNDM have timesteps as arrays - # It's more optimized to move all timesteps to correct device beforehand - timesteps_tensor = self.scheduler.timesteps - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - for i, t in enumerate(self.progress_bar(timesteps_tensor)): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - latents = 1 / 0.18215 * latents - image = self.vae.decode(latents).sample - - image = (image / 2 + 0.5).clip(0, 1) - - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16 - image = image.transpose([0, 2, 3, 1]).astype("float32").numpy() - - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.astype(text_embeddings.dtype) - ) - else: - has_nsfw_concept = None - - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image, has_nsfw_concept) - - return WildcardStableDiffusionOutput(images=image, nsfw_content_detected=has_nsfw_concept, prompts=prompt) diff --git a/ppdiffusers/examples/controlnet/README.md b/ppdiffusers/examples/controlnet/README.md deleted file mode 100644 index aaf37dddeada..000000000000 --- a/ppdiffusers/examples/controlnet/README.md +++ /dev/null @@ -1,211 +0,0 @@ -# ControlNet -[ControlNet](https://arxiv.org/abs/2302.05543) 是一种通过添加额外条件来控制扩散模型的神经网络结构。 -

- -## 安装依赖 -在运行这部分代码前,我们需要安装develop分支的ppdiffusers库: -```bash -cd ppdiffusers -python setup.py install -``` -此外我们还需要安装相关依赖: -```bash -pip install -r requirements.txt -``` - - -# ControlNet with Stable Diffusion预训练模型 -除文本提示外,ControlNet还需要一个控制图作为控制条件。每个预训练模型使用不同的控制方法进行训练,其中每种方法对应一种不同的控制图。例如,Canny to Image要求控制图像是Canny边缘检测的输出图像,而Pose to Image要求控制图是OpenPose骨骼姿态检测图像。目前我们支持如下控制方式及预训练模型。 -## Canny to Image -采用Canny边缘检测图片作为控制条件。 -``` -python gradio_canny2image.py -``` -![image](https://user-images.githubusercontent.com/20476674/222131385-0dfaa370-fb11-4b2b-9ef5-36143557578b.png) - -## Hed to Image -采用Hed边缘检测图片作为控制条件。 -``` -python gradio_hed2image.py -``` -![image](https://user-images.githubusercontent.com/20476674/223642261-d5bdbd83-06f9-459b-8224-486f2235f7a6.png) - - -## Pose to Image -采用OpenPose姿态图片作为控制条件。 -``` -python gradio_pose2image.py -``` -![image](https://user-images.githubusercontent.com/20476674/222131475-4dc8582a-d2a2-447a-9724-85461de04c26.png) - -## Semantic Segmentation to Image -采用ADE20K分割协议的图片作为控制条件。 -``` -python gradio_seg2image_segmenter.py -``` -![image](https://user-images.githubusercontent.com/20476674/222131908-b0c52512-ef42-4e4b-8fde-62c12c600ff2.png) - -## Depth to Image -采用Depth深度检测图片作为控制条件。 -``` -python gradio_depth2image.py -``` -![image](https://user-images.githubusercontent.com/31800336/236171819-29085f22-c99c-4f63-b0a0-7cce6ac98ebc.jpg) - -## Normal to Image -采用Normal检测图片作为控制条件。 -``` -python gradio_normal2image.py -``` -![image](https://user-images.githubusercontent.com/31800336/236171840-f31a4f1c-9997-41c0-83ca-4f87ca4cc870.jpg) - -## Hough Line to Image -采用HoughLine检测图片作为控制条件。 -``` -python gradio_hough2image.py -``` -![image](https://user-images.githubusercontent.com/31800336/236171830-f9254b66-9fbd-46d3-a3bc-e905c87d0ec3.jpg) - -## Pix2Pix to Image -(ControlNet V1.1) InstructPix2Pix根据指令修改图像 -``` -python gradio_ip2p2image.py -``` -![image](https://github.com/Submerge-Gu/Images/raw/main/4.png) - -## Shuffle to Image -(ControlNet V1.1) Shuffle打乱图像进行重构。 -``` -python gradio_shuffle2image.py -``` -![image](https://github.com/Submerge-Gu/Images/raw/main/control.png) - -# ControlNet模型训练 - -## Fill50K 训练例子 - -作为案例,我们将使用 Fill50K 数据集,带领大家训练 ControlNet 模型。首先我们需要下载数据集。 -```sh -wget https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/fill50k.zip -unzip -o fill50k.zip -``` -注意:下面的代码需要在32G V100上才可以正常运行。 - -### 单机单卡训练 -```bash -export FLAGS_conv_workspace_size_limit=4096 -python -u train_txt2img_control_trainer.py \ - --do_train \ - --output_dir ./sd15_control \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --learning_rate 1e-5 \ - --weight_decay 0.02 \ - --lr_scheduler_type "constant" \ - --warmup_steps 0 \ - --sd_locked True \ - --max_steps 10000000 \ - --logging_steps 50 \ - --image_logging_steps 400 \ - --save_steps 2000 \ - --save_total_limit 2 \ - --seed 23 \ - --dataloader_num_workers 4 \ - --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \ - --max_grad_norm -1 \ - --file_path ./fill50k \ - --recompute True \ - --overwrite_output_dir -``` - -`train_txt2img_control_trainer.py`代码可传入的参数解释如下: -> * `--vae_name_or_path`: 预训练`vae`模型名称或地址,`runwayml/stable-diffusion-v1-5/vae`,程序将自动从BOS上下载预训练好的权重。 -> * `--text_encoder_name_or_path`: 预训练`text_encoder`模型名称或地址,`runwayml/stable-diffusion-v1-5/text_encoder`,程序将自动从BOS上下载预训练好的权重。 -> * `--unet_name_or_path`: 预训练`unet`模型名称或地址,`runwayml/stable-diffusion-v1-5/unet`,程序将自动从BOS上下载预训练好的权重。 -> * `--pretrained_model_name_or_path`: 
加载预训练模型的名称或本地路径,如`runwayml/stable-diffusion-v1-5`,`pretrained_model_name_or_path`的优先级高于`vae_name_or_path`, `text_encoder_name_or_path`和`unet_name_or_path`。 -> * `--per_device_train_batch_size`: 训练时每张显卡所使用的`batch_size批量`,当我们的显存较小的时候,需要将这个值设置的小一点。 -> * `--gradient_accumulation_steps`: 梯度累积的步数,用户可以指定梯度累积的步数,在梯度累积的step中。减少多卡之间梯度的通信,减少更新的次数,扩大训练的batch_size。 -> * `--learning_rate`: 学习率。 -> * `--weight_decay`: `AdamW`优化器的`weight_decay`。 -> * `--max_steps`: 最大的训练步数。 -> * `--save_steps`: 每间隔多少步`(global step步数)`,保存模型。 -> * `--save_total_limit`: 最多保存多少个模型。 -> * `--lr_scheduler_type`: 要使用的学习率调度策略。默认为 `constant`。 -> * `--warmup_steps`: 用于从 0 到 `learning_rate` 的线性 warmup 的步数。 -> * `--image_logging_steps`: 每隔多少步,log训练过程中的图片,默认为`1000`步,注意`image_logging_steps`需要是`logging_steps`的整数倍。 -> * `--logging_steps`: logging日志的步数,默认为`50`步。 -> * `--output_dir`: 模型保存路径。 -> * `--seed`: 随机种子,为了可以复现训练结果,Tips:当前paddle设置该随机种子后仍无法完美复现。 -> * `--dataloader_num_workers`: Dataloader所使用的`num_workers`参数。 -> * `--file_path`: 训练数据文件夹所在的地址,上述例子我们使用了`fill50k`目录。 -> * `--num_inference_steps`: 推理预测时候使用的步数。 -> * `--model_max_length`: `tokenizer`中的`model_max_length`参数,超过该长度将会被截断。 -> * `--tokenizer_name`: 我们需要使用的`tokenizer_name`,我们可以使用英文的分词器`bert-base-uncased`,也可以使用中文的分词器`ernie-1.0`。 -> * `--use_ema`: 是否对`unet`使用`ema`,默认为`False`。 -> * `--max_grad_norm`: 梯度剪裁的最大norm值,`-1`表示不使用梯度裁剪策略。 -> * `--use_paddle_conv_init`: 是否使用`paddle`的卷积初始化策略,当我们开启该策略后可以很快发现在`fill50k`数据集上,模型很快就收敛了,默认值为 `False`。 -> * `--recompute`: 是否开启重计算,(`bool`, 可选, 默认为 `False`),在开启后我们可以增大`batch_size`。 -> * `--fp16`: 是否使用 fp16 混合精度训练而不是 fp32 训练。(`bool`, 可选, 默认为 `False`) -> * `--fp16_opt_level`: 混合精度训练模式,可为``O1``或``O2``模式,默认``O1``模式,默认O1. 只在fp16选项开启时候生效。 -> * `--is_ldmbert`: 是否使用`ldmbert`作为`text_encoder`,默认为`False`,即使用 `clip text_encoder`。 - - - -**Tips**: -> 结合 `paddle` 文档和 `torch` 文档可知,`paddle` 卷积层初始化是 `Xavier Normal`,`torch` 卷积层初始化是 `Uniform`,初始化方法边界值是`(-sqrt(groups/(in_channels*prod(*kernal_size))), sqrt(groups/(in_channels*prod(*kernal_size))))`。 -
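The tip above is easier to see in code. A minimal sketch (the helper name is illustrative and is not part of the training script) of a conv layer that reproduces the torch-style uniform bounds quoted above, as opposed to Paddle's default Xavier-Normal initialization:

```python
import math

import paddle
from paddle import nn


def torch_style_conv2d(in_channels, out_channels, kernel_size, groups=1, **kwargs):
    """Conv2D whose weight and bias follow the uniform bounds quoted in the tip above."""
    ks = kernel_size if isinstance(kernel_size, (tuple, list)) else (kernel_size, kernel_size)
    bound = math.sqrt(groups / (in_channels * ks[0] * ks[1]))
    init = nn.initializer.Uniform(low=-bound, high=bound)
    return nn.Conv2D(
        in_channels,
        out_channels,
        kernel_size,
        groups=groups,
        weight_attr=paddle.ParamAttr(initializer=init),
        bias_attr=paddle.ParamAttr(initializer=init),
        **kwargs,
    )


# drop-in replacement for nn.Conv2D(320, 320, 3, padding=1), which would use Xavier-Normal weights by default
conv = torch_style_conv2d(320, 320, 3, padding=1)
```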

- -### 单机多卡训练 (多机多卡训练,仅需在 paddle.distributed.launch 后加个 --ips IP1,IP2,IP3,IP4) -```bash -export FLAGS_conv_workspace_size_limit=4096 -python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" train_txt2img_control_trainer.py \ - --do_train \ - --output_dir ./sd15_control \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 1 \ - --learning_rate 1e-5 \ - --weight_decay 0.02 \ - --lr_scheduler_type "constant" \ - --warmup_steps 0 \ - --sd_locked True \ - --max_steps 10000000 \ - --logging_steps 50 \ - --image_logging_steps 400 \ - --save_steps 2000 \ - --save_total_limit 2 \ - --seed 23 \ - --dataloader_num_workers 4 \ - --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \ - --max_grad_norm -1 \ - --file_path ./fill50k \ - --recompute True \ - --overwrite_output_dir -``` - -## 模型推理 -待模型训练完毕,会在`output_dir`保存训练好的模型权重,我们可以使用如下的代码进行推理 -```python -from ppdiffusers import StableDiffusionControlNetPipeline, ControlNetModel -from ppdiffusers.utils import load_image -controlnet = ControlNetModel.from_pretrained("./sd15_control/checkpoint-12000/controlnet") -pipe = StableDiffusionControlNetPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", controlnet = controlnet, safety_checker=None) -canny_edged_image = load_image("https://user-images.githubusercontent.com/50394665/221844474-fd539851-7649-470e-bded-4d174271cc7f.png") -img = pipe(prompt="pale golden rod circle with old lace background", image=canny_edged_image, guidance_scale=9, num_inference_steps=50).images[0] -img.save("demo.png") -``` -
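If the control image is not already a Canny edge map, it can be produced with the same operator the repository's Canny annotator wraps (`cv2.Canny`). A minimal sketch reusing the `pipe` object from the block above; the input file name is only a placeholder:

```python
import cv2
import numpy as np
from PIL import Image

raw_image = Image.open("your_photo.png").convert("RGB")  # placeholder path: any RGB photo
edges = cv2.Canny(np.array(raw_image), 100, 200)  # same operator used by the CannyDetector annotator
canny_image = Image.fromarray(np.stack([edges] * 3, axis=-1))  # replicate the edge map to 3 channels
img = pipe(prompt="pale golden rod circle with old lace background", image=canny_image, guidance_scale=9, num_inference_steps=50).images[0]
img.save("demo_canny.png")
```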

- - -# 参考资料 -- https://github.com/lllyasviel/ControlNet/edit/main/docs/train.md -- https://github.com/huggingface/diffusers - -[https://github.com/Submerge-Gu/Images/blob/main/8.png]: https://github.com/Submerge-Gu/Images/raw/main/8.png diff --git a/ppdiffusers/examples/controlnet/annotator/_base_/ade20k.yml b/ppdiffusers/examples/controlnet/annotator/_base_/ade20k.yml deleted file mode 100644 index 595f02243e56..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/_base_/ade20k.yml +++ /dev/null @@ -1,44 +0,0 @@ -batch_size: 4 -iters: 80000 - -train_dataset: - type: ADE20K - dataset_root: data/ADEChallengeData2016/ - transforms: - - type: ResizeStepScaling - min_scale_factor: 0.5 - max_scale_factor: 2.0 - scale_step_size: 0.25 - - type: RandomPaddingCrop - crop_size: [512, 512] - - type: RandomHorizontalFlip - - type: RandomDistort - brightness_range: 0.4 - contrast_range: 0.4 - saturation_range: 0.4 - - type: Normalize - mode: train - -val_dataset: - type: ADE20K - dataset_root: data/ADEChallengeData2016/ - transforms: - - type: Normalize - mode: val - - -optimizer: - type: sgd - momentum: 0.9 - weight_decay: 4.0e-5 - -lr_scheduler: - type: PolynomialDecay - learning_rate: 0.01 - end_lr: 0 - power: 0.9 - -loss: - types: - - type: CrossEntropyLoss - coef: [1] diff --git a/ppdiffusers/examples/controlnet/annotator/_base_/cityscapes.yml b/ppdiffusers/examples/controlnet/annotator/_base_/cityscapes.yml deleted file mode 100644 index 4afbcbe944f0..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/_base_/cityscapes.yml +++ /dev/null @@ -1,44 +0,0 @@ -batch_size: 2 -iters: 80000 - -train_dataset: - type: Cityscapes - dataset_root: data/cityscapes - transforms: - - type: ResizeStepScaling - min_scale_factor: 0.5 - max_scale_factor: 2.0 - scale_step_size: 0.25 - - type: RandomPaddingCrop - crop_size: [1024, 512] - - type: RandomHorizontalFlip - - type: RandomDistort - brightness_range: 0.4 - contrast_range: 0.4 - saturation_range: 0.4 - - type: Normalize - mode: train - -val_dataset: - type: Cityscapes - dataset_root: data/cityscapes - transforms: - - type: Normalize - mode: val - - -optimizer: - type: sgd - momentum: 0.9 - weight_decay: 4.0e-5 - -lr_scheduler: - type: PolynomialDecay - learning_rate: 0.01 - end_lr: 0 - power: 0.9 - -loss: - types: - - type: CrossEntropyLoss - coef: [1] diff --git a/ppdiffusers/examples/controlnet/annotator/_base_/cityscapes_1024x1024.yml b/ppdiffusers/examples/controlnet/annotator/_base_/cityscapes_1024x1024.yml deleted file mode 100644 index 9a097378ecfc..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/_base_/cityscapes_1024x1024.yml +++ /dev/null @@ -1,20 +0,0 @@ -_base_: './cityscapes.yml' - -train_dataset: - transforms: - - type: ResizeStepScaling - min_scale_factor: 0.5 - max_scale_factor: 2.0 - scale_step_size: 0.25 - - type: RandomPaddingCrop - crop_size: [1024, 1024] - - type: RandomHorizontalFlip - - type: RandomDistort - brightness_range: 0.4 - contrast_range: 0.4 - saturation_range: 0.4 - - type: Normalize - -val_dataset: - transforms: - - type: Normalize diff --git a/ppdiffusers/examples/controlnet/annotator/canny/__init__.py b/ppdiffusers/examples/controlnet/annotator/canny/__init__.py deleted file mode 100644 index 1a656e976c6a..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/canny/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import cv2 - - -class CannyDetector: - def __call__(self, img, low_threshold, high_threshold): - return cv2.Canny(img, low_threshold, high_threshold) diff --git a/ppdiffusers/examples/controlnet/annotator/hed/__init__.py b/ppdiffusers/examples/controlnet/annotator/hed/__init__.py deleted file mode 100644 index 831b52931893..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/hed/__init__.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import cv2 -import numpy as np -import paddle - -from ..util import annotator_ckpts_path - - -class Network(paddle.nn.Layer): - def __init__(self, model_path=None): - super().__init__() - - self.netVggOne = paddle.nn.Sequential( - paddle.nn.Conv2D(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1), - paddle.nn.ReLU(), - paddle.nn.Conv2D(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1), - paddle.nn.ReLU(), - ) - - self.netVggTwo = paddle.nn.Sequential( - paddle.nn.MaxPool2D(kernel_size=2, stride=2), - paddle.nn.Conv2D(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1), - paddle.nn.ReLU(), - paddle.nn.Conv2D(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1), - paddle.nn.ReLU(), - ) - - self.netVggThr = paddle.nn.Sequential( - paddle.nn.MaxPool2D(kernel_size=2, stride=2), - paddle.nn.Conv2D(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1), - paddle.nn.ReLU(), - paddle.nn.Conv2D(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1), - paddle.nn.ReLU(), - paddle.nn.Conv2D(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1), - paddle.nn.ReLU(), - ) - - self.netVggFou = paddle.nn.Sequential( - paddle.nn.MaxPool2D(kernel_size=2, stride=2), - paddle.nn.Conv2D(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1), - paddle.nn.ReLU(), - paddle.nn.Conv2D(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1), - paddle.nn.ReLU(), - paddle.nn.Conv2D(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1), - paddle.nn.ReLU(), - ) - - self.netVggFiv = paddle.nn.Sequential( - paddle.nn.MaxPool2D(kernel_size=2, stride=2), - paddle.nn.Conv2D(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1), - paddle.nn.ReLU(), - paddle.nn.Conv2D(in_channels=512, 
out_channels=512, kernel_size=3, stride=1, padding=1), - paddle.nn.ReLU(), - paddle.nn.Conv2D(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1), - paddle.nn.ReLU(), - ) - - self.netScoreOne = paddle.nn.Conv2D(in_channels=64, out_channels=1, kernel_size=1, stride=1, padding=0) - self.netScoreTwo = paddle.nn.Conv2D(in_channels=128, out_channels=1, kernel_size=1, stride=1, padding=0) - self.netScoreThr = paddle.nn.Conv2D(in_channels=256, out_channels=1, kernel_size=1, stride=1, padding=0) - self.netScoreFou = paddle.nn.Conv2D(in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0) - self.netScoreFiv = paddle.nn.Conv2D(in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0) - - self.netCombine = paddle.nn.Sequential( - paddle.nn.Conv2D(in_channels=5, out_channels=1, kernel_size=1, stride=1, padding=0), paddle.nn.Sigmoid() - ) - - if model_path: - self.set_state_dict(paddle.load(model_path)) - - def forward(self, tenInput): - tenInput = tenInput * 255.0 - tenInput = tenInput - paddle.to_tensor( - [104.00698793, 116.66876762, 122.67891434], - dtype=tenInput.dtype, - ).reshape([1, 3, 1, 1]) - - tenVggOne = self.netVggOne(tenInput) - tenVggTwo = self.netVggTwo(tenVggOne) - tenVggThr = self.netVggThr(tenVggTwo) - tenVggFou = self.netVggFou(tenVggThr) - tenVggFiv = self.netVggFiv(tenVggFou) - - tenScoreOne = self.netScoreOne(tenVggOne) - tenScoreTwo = self.netScoreTwo(tenVggTwo) - tenScoreThr = self.netScoreThr(tenVggThr) - tenScoreFou = self.netScoreFou(tenVggFou) - tenScoreFiv = self.netScoreFiv(tenVggFiv) - - tenScoreOne = paddle.nn.functional.interpolate( - tenScoreOne, size=(tenInput.shape[2], tenInput.shape[3]), mode="bilinear", align_corners=False - ) - tenScoreTwo = paddle.nn.functional.interpolate( - tenScoreTwo, size=(tenInput.shape[2], tenInput.shape[3]), mode="bilinear", align_corners=False - ) - tenScoreThr = paddle.nn.functional.interpolate( - tenScoreThr, size=(tenInput.shape[2], tenInput.shape[3]), mode="bilinear", align_corners=False - ) - tenScoreFou = paddle.nn.functional.interpolate( - tenScoreFou, size=(tenInput.shape[2], tenInput.shape[3]), mode="bilinear", align_corners=False - ) - tenScoreFiv = paddle.nn.functional.interpolate( - tenScoreFiv, size=(tenInput.shape[2], tenInput.shape[3]), mode="bilinear", align_corners=False - ) - - return self.netCombine(paddle.concat([tenScoreOne, tenScoreTwo, tenScoreThr, tenScoreFou, tenScoreFiv], 1)) - - -remote_model_path = ( - "https://paddlenlp.bj.bcebos.com/models/community/westfish/network-bsds500-paddle/network-bsds500.pdparams" -) - - -class HEDdetector: - def __init__(self, modelpath=None): - modelpath = os.path.join(annotator_ckpts_path, "network-bsds500.pdparams") - if not os.path.exists(modelpath): - from paddlenlp.utils.downloader import get_path_from_url_with_filelock - - get_path_from_url_with_filelock(remote_model_path, root_dir=annotator_ckpts_path) - self.model_path = modelpath - self.netNetwork = Network(modelpath) - self.netNetwork.eval() - - def __call__(self, input_image): - assert input_image.ndim == 3 - input_image = input_image[:, :, ::-1].copy() - with paddle.no_grad(): - image_hed = paddle.to_tensor(input_image).astype(paddle.float32) - image_hed = image_hed / 255.0 - image_hed = image_hed.transpose([2, 0, 1]).unsqueeze(axis=0) - edge = self.netNetwork(image_hed)[0] - edge = (edge.cpu().numpy() * 255.0).clip(0, 255).astype(np.uint8) - return edge[0] - - -def nms(x, t, s): - x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s) - - f1 = np.array([[0, 0, 0], [1, 1, 1], 
[0, 0, 0]], dtype=np.uint8) - f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8) - f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8) - f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8) - - y = np.zeros_like(x) - - for f in [f1, f2, f3, f4]: - np.putmask(y, cv2.dilate(x, kernel=f) == x, x) - - z = np.zeros_like(y, dtype=np.uint8) - z[y > t] = 255 - return z diff --git a/ppdiffusers/examples/controlnet/annotator/midas_paddle/__init__.py b/ppdiffusers/examples/controlnet/annotator/midas_paddle/__init__.py deleted file mode 100644 index 543d0774c523..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/midas_paddle/__init__.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy - -import cv2 -import numpy as np -from annotator.util import annotator_ckpts_path -from einops import rearrange - -from .api_inference import MidasInference - - -class MidasDetector_Infer: - def __init__(self): - self.model = MidasInference(annotator_ckpts_path) - - def __call__(self, input_image, a=np.pi * 2.0, bg_th=0.1): - assert input_image.ndim == 3 - image_depth = input_image - image_depth = rearrange(image_depth, "h w c -> 1 c h w") - image_depth = image_depth.astype("float32") - image_depth = image_depth / 127.5 - 1.0 - depth = self.model.predict(image_depth)[0] - depth_pt = copy.deepcopy(depth) - depth_pt -= depth_pt.min() - depth_pt /= depth_pt.max() - depth_image = (depth_pt * 255.0).clip(min=0, max=255).astype(np.uint8) - depth_np = depth - x = cv2.Sobel(depth_np, cv2.CV_32F, 1, 0, ksize=3) - y = cv2.Sobel(depth_np, cv2.CV_32F, 0, 1, ksize=3) - z = np.ones_like(x) * a - x[depth_pt < bg_th] = 0 - y[depth_pt < bg_th] = 0 - normal = np.stack([x, y, z], axis=2) - normal /= np.sum(normal**2.0, axis=2, keepdims=True) ** 0.5 - normal_image = (normal * 127.5 + 127.5).clip(min=0, max=255).astype(np.uint8) - return depth_image, normal_image diff --git a/ppdiffusers/examples/controlnet/annotator/midas_paddle/api_inference.py b/ppdiffusers/examples/controlnet/annotator/midas_paddle/api_inference.py deleted file mode 100644 index 4f578bbaf306..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/midas_paddle/api_inference.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
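For reference, the `MidasDetector_Infer` wrapper above returns a uint8 depth map together with a pseudo normal map derived from the depth via Sobel gradients. A minimal usage sketch, assuming the `annotator` package is importable and that the `dpt_hybrid` inference model can be downloaded automatically by the `MidasInference` helper defined in the file below; the input path is a placeholder:

```python
import cv2

from annotator.midas_paddle import MidasDetector_Infer

apply_midas = MidasDetector_Infer()

img = cv2.imread("room.png")  # HWC uint8 image (path is a placeholder)
depth_map, normal_map = apply_midas(img)  # uint8 depth map and derived normal map
cv2.imwrite("depth.png", depth_map)
cv2.imwrite("normal.png", normal_map)
```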
- -import os - -import paddle.inference as paddle_infer - -from paddlenlp.utils.downloader import get_path_from_url_with_filelock - - -def checkmodel(model_dir, model_name): - if not os.path.exists(os.path.join(model_dir, model_name, model_name + ".pdmodel")): - model_url = "https://bj.bcebos.com/v1/paddledet/models/dpt_hybrid.zip" - get_path_from_url_with_filelock(model_url, root_dir=model_dir) - - -class MidasInference: - def __init__(self, model_dir, model_name="dpt_hybrid", batchsize=8, device="GPU", run_mode="paddle"): - checkmodel(model_dir, model_name) - model_file = os.path.join(model_dir, model_name, model_name + ".pdmodel") - params_file = os.path.join(model_dir, model_name, model_name + ".pdiparams") - config = paddle_infer.Config(model_file, params_file) - self.batchsize = batchsize - if device == "GPU": - # initial GPU memory(M), device ID - config.enable_use_gpu(200, 0) - # optimize graph and fuse op - config.switch_ir_optim(True) - elif device == "XPU": - if config.lite_engine_enabled(): - config.enable_lite_engine() - config.enable_xpu(10 * 1024 * 1024) - elif device == "NPU": - if config.lite_engine_enabled(): - config.enable_lite_engine() - config.enable_custom_device("npu") - else: - config.disable_gpu() - config.set_cpu_math_library_num_threads(4) - - precision_map = { - "trt_int8": paddle_infer.Config.Precision.Int8, - "trt_fp32": paddle_infer.Config.Precision.Float32, - "trt_fp16": paddle_infer.Config.Precision.Half, - } - if run_mode in precision_map.keys(): - config.enable_tensorrt_engine( - workspace_size=(1 << 25) * batchsize, - max_batch_size=batchsize, - min_subgraph_size=3, - precision_mode=precision_map[run_mode], - use_static=False, - use_calib_mode=False, - ) - min_input_shape = {"image": [1, 3, 224, 224]} - max_input_shape = {"image": [1, 3, 1280, 1280]} - opt_input_shape = {"image": [1, 3, 384, 384]} - config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape, opt_input_shape) - - # disable print log when predict - config.disable_glog_info() - # enable shared memory - config.enable_memory_optim() - # disable feed, fetch OP, needed by zero_copy_run - config.switch_use_feed_fetch_ops(False) - self.predictor = paddle_infer.create_predictor(config) - - def predict(self, inputs): - - input_names = self.predictor.get_input_names() - input_handle = self.predictor.get_input_handle(input_names[0]) - - input_handle.copy_from_cpu(inputs) - self.predictor.run() - output_names = self.predictor.get_output_names() - output_handle = self.predictor.get_output_handle(output_names[0]) - output_data = output_handle.copy_to_cpu() # numpy.ndarray - return output_data diff --git a/ppdiffusers/examples/controlnet/annotator/mlsd/__init__.py b/ppdiffusers/examples/controlnet/annotator/mlsd/__init__.py deleted file mode 100644 index 8e453eef33c2..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/mlsd/__init__.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
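`MidasInference` above follows the usual Paddle Inference pattern: build a `Config`, pick a device, optionally register a TensorRT engine with a dynamic-shape profile, then run through input/output handles. A hedged sketch of the non-default TensorRT FP16 path (the placeholder input and the choice of `trt_fp16` are assumptions, not something this file requires):

```python
import numpy as np

from annotator.midas_paddle.api_inference import MidasInference
from annotator.util import annotator_ckpts_path

# Request a TensorRT FP16 engine; the dynamic-shape profile above covers 224x224 up to 1280x1280.
midas = MidasInference(annotator_ckpts_path, model_name="dpt_hybrid", device="GPU", run_mode="trt_fp16")

batch = np.zeros([1, 3, 384, 384], dtype=np.float32)  # placeholder NCHW input scaled to [-1, 1]
depth = midas.predict(batch)  # numpy array holding the predicted depth
```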
- -import os - -import cv2 -import numpy as np -import paddle -from annotator.util import annotator_ckpts_path - -from .models.mbv2_mlsd_large import MobileV2_MLSD_Large -from .utils import pred_lines - -remote_model_path = "https://bj.bcebos.com/v1/paddlenlp/models/community/ppdiffusers/mlsd_large_512_fp32.pdparams" - - -class MLSDdetector: - def __init__(self): - model_path = os.path.join(annotator_ckpts_path, "mlsd_large_512_fp32.pdparams") - if not os.path.exists(model_path): - from basicsr.utils.download_util import load_file_from_url - - load_file_from_url(remote_model_path, model_dir=annotator_ckpts_path) - self.model = MobileV2_MLSD_Large() - self.model.eval() - self.model.set_dict(paddle.load(model_path)) - - def __call__(self, input_image, thr_v, thr_d): - assert input_image.ndim == 3 - img = input_image - img_output = np.zeros_like(img) - with paddle.no_grad(): - lines = pred_lines(img, self.model, [img.shape[0], img.shape[1]], thr_v, thr_d) - for line in lines: - x_start, y_start, x_end, y_end = [int(val) for val in line] - cv2.line(img_output, (x_start, y_start), (x_end, y_end), [255, 255, 255], 1) - return img_output[:, :, (0)] diff --git a/ppdiffusers/examples/controlnet/annotator/mlsd/models/mbv2_mlsd_large.py b/ppdiffusers/examples/controlnet/annotator/mlsd/models/mbv2_mlsd_large.py deleted file mode 100644 index 2866c79b2abe..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/mlsd/models/mbv2_mlsd_large.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
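`MLSDdetector` above rasterises the predicted line segments onto a black canvas and returns a single-channel Hough-style map; its weights are fetched with basicsr's download helper on first use. A minimal usage sketch, assuming the `annotator` package is importable; the 0.1/0.1 thresholds are common demo values, not values mandated by this code:

```python
import cv2

from annotator.mlsd import MLSDdetector

apply_mlsd = MLSDdetector()

img = cv2.imread("building.png")  # HWC uint8 image (path is a placeholder)
hough_map = apply_mlsd(img, thr_v=0.1, thr_d=0.1)  # score threshold and minimum segment length
cv2.imwrite("hough.png", hough_map)
```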
- -import paddle -from annotator.mlsd import utils - - -class BlockTypeA(paddle.nn.Layer): - def __init__(self, in_c1, in_c2, out_c1, out_c2, upscale=True): - super(BlockTypeA, self).__init__() - self.conv1 = paddle.nn.Sequential( - paddle.nn.Conv2D(in_channels=in_c2, out_channels=out_c2, kernel_size=1), - paddle.nn.BatchNorm2D( - num_features=out_c2, - momentum=1 - 0.1, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - use_global_stats=True, - ), - paddle.nn.ReLU(), - ) - self.conv2 = paddle.nn.Sequential( - paddle.nn.Conv2D(in_channels=in_c1, out_channels=out_c1, kernel_size=1), - paddle.nn.BatchNorm2D( - num_features=out_c1, - momentum=1 - 0.1, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - use_global_stats=True, - ), - paddle.nn.ReLU(), - ) - self.upscale = upscale - - def forward(self, a, b): - b = self.conv1(b) - a = self.conv2(a) - if self.upscale: - b = paddle.nn.functional.interpolate(x=b, scale_factor=2.0, mode="bilinear", align_corners=True) - return paddle.concat(x=(a, b), axis=1) - - -class BlockTypeB(paddle.nn.Layer): - def __init__(self, in_c, out_c): - super(BlockTypeB, self).__init__() - self.conv1 = paddle.nn.Sequential( - paddle.nn.Conv2D(in_channels=in_c, out_channels=in_c, kernel_size=3, padding=1), - paddle.nn.BatchNorm2D( - num_features=in_c, - momentum=1 - 0.1, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - use_global_stats=True, - ), - paddle.nn.ReLU(), - ) - self.conv2 = paddle.nn.Sequential( - paddle.nn.Conv2D(in_channels=in_c, out_channels=out_c, kernel_size=3, padding=1), - paddle.nn.BatchNorm2D( - num_features=out_c, - momentum=1 - 0.1, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - use_global_stats=True, - ), - paddle.nn.ReLU(), - ) - - def forward(self, x): - x = self.conv1(x) + x - x = self.conv2(x) - return x - - -class BlockTypeC(paddle.nn.Layer): - def __init__(self, in_c, out_c): - super(BlockTypeC, self).__init__() - self.conv1 = paddle.nn.Sequential( - paddle.nn.Conv2D(in_channels=in_c, out_channels=in_c, kernel_size=3, padding=5, dilation=5), - paddle.nn.BatchNorm2D( - num_features=in_c, - momentum=1 - 0.1, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - use_global_stats=True, - ), - paddle.nn.ReLU(), - ) - self.conv2 = paddle.nn.Sequential( - paddle.nn.Conv2D(in_channels=in_c, out_channels=in_c, kernel_size=3, padding=1), - paddle.nn.BatchNorm2D( - num_features=in_c, - momentum=1 - 0.1, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - use_global_stats=True, - ), - paddle.nn.ReLU(), - ) - self.conv3 = paddle.nn.Conv2D(in_channels=in_c, out_channels=out_c, kernel_size=1) - - def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) - x = self.conv3(x) - return x - - -def _make_divisible(v, divisor, min_value=None): - """ - This function is taken from the original tf repo. 
- It ensures that all layers have a channel number that is divisible by 8 - It can be seen here: - https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py - :param v: - :param divisor: - :param min_value: - :return: - """ - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - if new_v < 0.9 * v: - new_v += divisor - return new_v - - -class ConvBNReLU(paddle.nn.Sequential): - def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1): - self.channel_pad = out_planes - in_planes - self.stride = stride - if stride == 2: - padding = 0 - else: - padding = (kernel_size - 1) // 2 - super(ConvBNReLU, self).__init__( - paddle.nn.Conv2D( - in_channels=in_planes, - out_channels=out_planes, - kernel_size=kernel_size, - stride=stride, - padding=padding, - groups=groups, - bias_attr=False, - ), - paddle.nn.BatchNorm2D( - num_features=out_planes, - momentum=1 - 0.1, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - use_global_stats=True, - ), - paddle.nn.ReLU6(), - ) - self.max_pool = paddle.nn.MaxPool2D(kernel_size=stride, stride=stride) - - def forward(self, x): - if self.stride == 2: - x = paddle.nn.functional.pad(x=x, pad=(0, 1, 0, 1), mode="constant", value=0) - for module in self: - if not isinstance(module, paddle.nn.MaxPool2D): - x = module(x) - return x - - -class InvertedResidual(paddle.nn.Layer): - def __init__(self, inp, oup, stride, expand_ratio): - super(InvertedResidual, self).__init__() - self.stride = stride - assert stride in [1, 2] - hidden_dim = int(round(inp * expand_ratio)) - self.use_res_connect = self.stride == 1 and inp == oup - layers = [] - if expand_ratio != 1: - layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1)) - layers.extend( - [ - ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim), - paddle.nn.Conv2D( - in_channels=hidden_dim, out_channels=oup, kernel_size=1, stride=1, padding=0, bias_attr=False - ), - paddle.nn.BatchNorm2D( - num_features=oup, - momentum=1 - 0.1, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - use_global_stats=True, - ), - ] - ) - self.conv = paddle.nn.Sequential(*layers) - - def forward(self, x): - if self.use_res_connect: - return x + self.conv(x) - else: - return self.conv(x) - - -class MobileNetV2(paddle.nn.Layer): - def __init__(self): - """ - MobileNet V2 main class - Args: - num_classes (int): Number of classes - width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount - inverted_residual_setting: Network structure - round_nearest (int): Round the number of channels in each layer to be a multiple of this number - Set to 1 to turn off rounding - block: Module specifying inverted residual building block for mobilenet - """ - super(MobileNetV2, self).__init__() - block = InvertedResidual - input_channel = 32 - last_channel = 1280 - width_mult = 1.0 - round_nearest = 8 - inverted_residual_setting = [[1, 16, 1, 1], [6, 24, 2, 2], [6, 32, 3, 2], [6, 64, 4, 2], [6, 96, 3, 1]] - if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4: - raise ValueError( - "inverted_residual_setting should be non-empty or a 4-element list, got {}".format( - inverted_residual_setting - ) - ) - input_channel = _make_divisible(input_channel * width_mult, round_nearest) - self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest) - features = [ConvBNReLU(4, input_channel, stride=2)] - for t, c, n, s in inverted_residual_setting: - 
output_channel = _make_divisible(c * width_mult, round_nearest) - for i in range(n): - stride = s if i == 0 else 1 - features.append(block(input_channel, output_channel, stride, expand_ratio=t)) - input_channel = output_channel - self.features = paddle.nn.Sequential(*features) - self.fpn_selected = [1, 3, 6, 10, 13] - for m in self.named_sublayers(): - if isinstance(m, paddle.nn.Conv2D): - utils.kaiming_normal_(m.weight, mode="fan_out") - if m.bias is not None: - utils.zeros_(m.bias) - elif isinstance(m, paddle.nn.BatchNorm2D): - utils.ones_(m.weight) - utils.zeros_(m.bias) - elif isinstance(m, paddle.nn.Linear): - utils.normal_(m.weight, 0, 0.01) - utils.zeros_(m.bias) - - def _forward_impl(self, x): - fpn_features = [] - for i, f in enumerate(self.features): - if i > self.fpn_selected[-1]: - break - x = f(x) - if i in self.fpn_selected: - fpn_features.append(x) - c1, c2, c3, c4, c5 = fpn_features - return c1, c2, c3, c4, c5 - - def forward(self, x): - return self._forward_impl(x) - - -class MobileV2_MLSD_Large(paddle.nn.Layer): - def __init__(self): - super(MobileV2_MLSD_Large, self).__init__() - self.backbone = MobileNetV2() - self.block15 = BlockTypeA(in_c1=64, in_c2=96, out_c1=64, out_c2=64, upscale=False) - self.block16 = BlockTypeB(128, 64) - self.block17 = BlockTypeA(in_c1=32, in_c2=64, out_c1=64, out_c2=64) - self.block18 = BlockTypeB(128, 64) - self.block19 = BlockTypeA(in_c1=24, in_c2=64, out_c1=64, out_c2=64) - self.block20 = BlockTypeB(128, 64) - self.block21 = BlockTypeA(in_c1=16, in_c2=64, out_c1=64, out_c2=64) - self.block22 = BlockTypeB(128, 64) - self.block23 = BlockTypeC(64, 16) - print("MobileV2_MLSD_Large: ", MobileNetV2, self.backbone) - - def forward(self, x): - c1, c2, c3, c4, c5 = self.backbone(x) - x = self.block15(c4, c5) - x = self.block16(x) - x = self.block17(c3, x) - x = self.block18(x) - x = self.block19(c2, x) - x = self.block20(x) - x = self.block21(c1, x) - x = self.block22(x) - x = self.block23(x) - x = x[:, 7:, :, :] - return x diff --git a/ppdiffusers/examples/controlnet/annotator/mlsd/utils.py b/ppdiffusers/examples/controlnet/annotator/mlsd/utils.py deleted file mode 100644 index 64c89a2a2e68..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/mlsd/utils.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import math - -import cv2 -import numpy as np -import paddle - -""" -M-LSD -Copyright 2021-present NAVER Corp. -Apache License v2.0 -""" - - -def normal_(tensor, mean=0.0, std=1.0): - """ - Modified tensor inspace using normal_ - Args: - tensor (paddle.Tensor): paddle Tensor - mean (float|int): mean value. - std (float|int): std value. 
- Return: - tensor - """ - return _no_grad_normal_(tensor, mean, std) - - -def zeros_(tensor): - """ - Modified tensor inspace using zeros_ - Args: - tensor (paddle.Tensor): paddle Tensor - Return: - tensor - """ - return _no_grad_fill_(tensor, 0) - - -def kaiming_normal_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False): - """ - Modified tensor inspace using kaiming_normal_ - Args: - tensor (paddle.Tensor): paddle Tensor - mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut - nonlinearity (str): nonlinearity method name - reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. - Return: - tensor - """ - fan = _calculate_correct_fan(tensor, mode, reverse) - gain = _calculate_gain(nonlinearity, a) - std = gain / math.sqrt(fan) - return _no_grad_normal_(tensor, 0, std) - - -def _no_grad_fill_(tensor, value=0.0): - with paddle.no_grad(): - tensor.set_value(paddle.full_like(tensor, value, dtype=tensor.dtype)) - return tensor - - -def _no_grad_normal_(tensor, mean=0.0, std=1.0): - with paddle.no_grad(): - tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape)) - return tensor - - -def _calculate_gain(nonlinearity, param=None): - linear_fns = ["linear", "conv1d", "conv2d", "conv3d", "conv_transpose1d", "conv_transpose2d", "conv_transpose3d"] - if nonlinearity in linear_fns or nonlinearity == "sigmoid": - return 1 - elif nonlinearity == "tanh": - return 5.0 / 3 - elif nonlinearity == "relu": - return math.sqrt(2.0) - elif nonlinearity == "leaky_relu": - if param is None: - negative_slope = 0.01 - elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float): - # True/False are instances of int, hence check above - negative_slope = param - else: - raise ValueError("negative_slope {} not a valid number".format(param)) - return math.sqrt(2.0 / (1 + negative_slope**2)) - elif nonlinearity == "selu": - return 3.0 / 4 - else: - raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) - - -# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html -def _calculate_correct_fan(tensor, mode, reverse=False): - mode = mode.lower() - valid_modes = ["fan_in", "fan_out"] - if mode not in valid_modes: - raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes)) - - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) - - return fan_in if mode == "fan_in" else fan_out - - -def _calculate_fan_in_and_fan_out(tensor, reverse=False): - """ - Calculate (fan_in, _fan_out) for tensor - Args: - tensor (Tensor): paddle.Tensor - reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. 
: conv.weight [cout, cin, kh, kw] is False; linear.weight [cin, cout] is True - Return: - Tuple[fan_in, fan_out] - """ - if tensor.ndim < 2: - raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions") - - if reverse: - num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] - else: - num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0] - - receptive_field_size = 1 - if tensor.ndim > 2: - receptive_field_size = np.prod(tensor.shape[2:]) - - fan_in = num_input_fmaps * receptive_field_size - fan_out = num_output_fmaps * receptive_field_size - - return fan_in, fan_out - - -def deccode_output_score_and_ptss(tpMap, topk_n=200, ksize=5): - """ - tpMap: - center: tpMap[1, 0, :, :] - displacement: tpMap[1, 1:5, :, :] - """ - b, c, h, w = tpMap.shape - assert b == 1, "only support bsize==1" - displacement = tpMap[:, 1:5, :, :][0] - center = tpMap[:, (0), :, :] - heat = paddle.nn.functional.sigmoid(x=center).unsqueeze(0) - hmax = paddle.nn.functional.max_pool2d( - kernel_size=(ksize, ksize), stride=1, padding=(ksize - 1) // 2, x=heat - ).squeeze(0) - keep = (hmax == heat).astype(dtype="float32") - heat = heat * keep - heat = heat.reshape([-1]) - scores, indices = paddle.topk(x=heat, k=topk_n, axis=-1, largest=True) - w_t = paddle.to_tensor(w) - yy = paddle.floor_divide(x=indices, y=w_t).unsqueeze(axis=-1) - xx = paddle.mod(indices, w_t).unsqueeze(axis=-1) - ptss = paddle.concat(x=(yy, xx), axis=-1) - ptss = ptss.detach().cpu().numpy() - scores = scores.detach().cpu().numpy() - displacement = displacement.detach().cpu().numpy() - displacement = displacement.transpose((1, 2, 0)) - return ptss, scores, displacement - - -def pred_lines(image, model, input_shape=[512, 512], score_thr=0.1, dist_thr=20.0): - h, w, _ = image.shape - h_ratio, w_ratio = [h / input_shape[0], w / input_shape[1]] - resized_image = np.concatenate( - [ - cv2.resize(image, (input_shape[1], input_shape[0]), interpolation=cv2.INTER_AREA), - np.ones([input_shape[0], input_shape[1], 1]), - ], - axis=-1, - ) - resized_image = resized_image.transpose((2, 0, 1)) - batch_image = np.expand_dims(resized_image, axis=0).astype("float32") - batch_image = batch_image / 127.5 - 1.0 - batch_image = paddle.to_tensor(data=batch_image).astype(dtype="float32") - outputs = model(batch_image) - pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3) - start = vmap[:, :, :2] - end = vmap[:, :, 2:] - dist_map = np.sqrt(np.sum((start - end) ** 2, axis=-1)) - segments_list = [] - for center, score in zip(pts, pts_score): - y, x = center - distance = dist_map[y, x] - if score > score_thr and distance > dist_thr: - disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[(y), (x), :] - x_start = x + disp_x_start - y_start = y + disp_y_start - x_end = x + disp_x_end - y_end = y + disp_y_end - segments_list.append([x_start, y_start, x_end, y_end]) - lines = 2 * np.array(segments_list) - lines[:, (0)] = lines[:, (0)] * w_ratio - lines[:, (1)] = lines[:, (1)] * h_ratio - lines[:, (2)] = lines[:, (2)] * w_ratio - lines[:, (3)] = lines[:, (3)] * h_ratio - return lines diff --git a/ppdiffusers/examples/controlnet/annotator/openpose/__init__.py b/ppdiffusers/examples/controlnet/annotator/openpose/__init__.py deleted file mode 100644 index e07f249e8c9f..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/openpose/__init__.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. 
All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import numpy as np -import paddle -import paddlehub as hub -from annotator.util import annotator_ckpts_path - -from . import util - - -class OpenposePaddleDetector: - def __init__(self): - self.body_estimation = hub.Module(name="openpose_body_estimation") - self.hand_estimation = hub.Module(name="openpose_hands_estimation") - - def __call__(self, oriImg, hand=False): - oriImg = oriImg[:, :, ::-1].copy() - with paddle.no_grad(): - canvas = oriImg[:, :, ::-1].copy() - canvas.fill(0) - result = self.body_estimation.predict(oriImg, save_path="saved_images", visualization=False) - canvas = self.body_estimation.draw_pose(canvas, result["candidate"], result["subset"]) - if hand: - hands_list = util.hand_detect(result["candidate"], result["subset"], oriImg) - all_hand_peaks = [] - for x, y, w, is_left in hands_list: - scale_search = [0.5, 1.0, 1.5, 2.0] - peaks = self.hand_estimation.hand_estimation( - oriImg[y : y + w, x : x + w, :], scale_search=scale_search - ) - peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x) - peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y) - all_hand_peaks.append(peaks) - canvas = self.hand_estimation.draw_hand(canvas, all_hand_peaks) - - return canvas, dict(candidate=result["candidate"].tolist(), subset=result["subset"].tolist()) diff --git a/ppdiffusers/examples/controlnet/annotator/openpose/util.py b/ppdiffusers/examples/controlnet/annotator/openpose/util.py deleted file mode 100644 index 0f7c1092d3ca..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/openpose/util.py +++ /dev/null @@ -1,247 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
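`OpenposePaddleDetector` above delegates body and hand estimation to two PaddleHub modules and returns both the rendered skeleton canvas and the raw candidate/subset arrays. A minimal usage sketch, assuming PaddleHub can download `openpose_body_estimation` and `openpose_hands_estimation` on first use; the input path is a placeholder:

```python
import cv2

from annotator.openpose import OpenposePaddleDetector

apply_openpose = OpenposePaddleDetector()

img = cv2.imread("person.png")  # HWC uint8 image (path is a placeholder)
pose_map, pose_dict = apply_openpose(img, hand=True)  # rendered skeleton + raw keypoints
cv2.imwrite("pose.png", pose_map)
print(len(pose_dict["candidate"]), "body keypoints detected")
```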
- -import math - -import cv2 -import matplotlib -import numpy as np - - -def pad_right_down_corner(img, stride, padValue): - h = img.shape[0] - w = img.shape[1] - - pad = 4 * [None] - pad[0] = 0 # up - pad[1] = 0 # left - pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down - pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right - - img_padded = img - pad_up = np.tile(img_padded[0:1, :, :] * 0 + padValue, (pad[0], 1, 1)) - img_padded = np.concatenate((pad_up, img_padded), axis=0) - pad_left = np.tile(img_padded[:, 0:1, :] * 0 + padValue, (1, pad[1], 1)) - img_padded = np.concatenate((pad_left, img_padded), axis=1) - pad_down = np.tile(img_padded[-2:-1, :, :] * 0 + padValue, (pad[2], 1, 1)) - img_padded = np.concatenate((img_padded, pad_down), axis=0) - pad_right = np.tile(img_padded[:, -2:-1, :] * 0 + padValue, (1, pad[3], 1)) - img_padded = np.concatenate((img_padded, pad_right), axis=1) - - return img_padded, pad - - -# transfer caffe model to pytorch which will match the layer name -def transfer(model, model_weights): - transfered_model_weights = {} - for weights_name in model.state_dict().keys(): - transfered_model_weights[weights_name] = model_weights[".".join(weights_name.split(".")[1:])] - return transfered_model_weights - - -# draw the body keypoint and lims -def draw_bodypose(canvas, candidate, subset): - stickwidth = 4 - limbSeq = [ - [2, 3], - [2, 6], - [3, 4], - [4, 5], - [6, 7], - [7, 8], - [2, 9], - [9, 10], - [10, 11], - [2, 12], - [12, 13], - [13, 14], - [2, 1], - [1, 15], - [15, 17], - [1, 16], - [16, 18], - [3, 17], - [6, 18], - ] - - colors = [ - [255, 0, 0], - [255, 85, 0], - [255, 170, 0], - [255, 255, 0], - [170, 255, 0], - [85, 255, 0], - [0, 255, 0], - [0, 255, 85], - [0, 255, 170], - [0, 255, 255], - [0, 170, 255], - [0, 85, 255], - [0, 0, 255], - [85, 0, 255], - [170, 0, 255], - [255, 0, 255], - [255, 0, 170], - [255, 0, 85], - ] - for i in range(18): - for n in range(len(subset)): - index = int(subset[n][i]) - if index == -1: - continue - x, y = candidate[index][0:2] - cv2.circle(canvas, (int(x), int(y)), 4, colors[i], thickness=-1) - for i in range(17): - for n in range(len(subset)): - index = subset[n][np.array(limbSeq[i]) - 1] - if -1 in index: - continue - cur_canvas = canvas.copy() - Y = candidate[index.astype(int), 0] - X = candidate[index.astype(int), 1] - mX = np.mean(X) - mY = np.mean(Y) - length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5 - angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) - polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1) - cv2.fillConvexPoly(cur_canvas, polygon, colors[i]) - canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) - return canvas - - -# image drawed by opencv is not good. 
-def draw_handpose(canvas, all_hand_peaks, show_number=False): - edges = [ - [0, 1], - [1, 2], - [2, 3], - [3, 4], - [0, 5], - [5, 6], - [6, 7], - [7, 8], - [0, 9], - [9, 10], - [10, 11], - [11, 12], - [0, 13], - [13, 14], - [14, 15], - [15, 16], - [0, 17], - [17, 18], - [18, 19], - [19, 20], - ] - - for peaks in all_hand_peaks: - for ie, e in enumerate(edges): - if np.sum(np.all(peaks[e], axis=1) == 0) == 0: - x1, y1 = peaks[e[0]] - x2, y2 = peaks[e[1]] - cv2.line( - canvas, - (x1, y1), - (x2, y2), - matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, - thickness=2, - ) - - for i, keyponit in enumerate(peaks): - x, y = keyponit - cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1) - if show_number: - cv2.putText(canvas, str(i), (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 0, 0), lineType=cv2.LINE_AA) - return canvas - - -# detect hand according to body pose keypoints -# please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp -def hand_detect(candidate, subset, oriImg): - # right hand: wrist 4, elbow 3, shoulder 2 - # left hand: wrist 7, elbow 6, shoulder 5 - ratioWristElbow = 0.33 - detect_result = [] - image_height, image_width = oriImg.shape[0:2] - for person in subset.astype(int): - # if any of three not detected - has_left = np.sum(person[[5, 6, 7]] == -1) == 0 - has_right = np.sum(person[[2, 3, 4]] == -1) == 0 - if not (has_left or has_right): - continue - hands = [] - # left hand - if has_left: - left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]] - x1, y1 = candidate[left_shoulder_index][:2] - x2, y2 = candidate[left_elbow_index][:2] - x3, y3 = candidate[left_wrist_index][:2] - hands.append([x1, y1, x2, y2, x3, y3, True]) - # right hand - if has_right: - right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]] - x1, y1 = candidate[right_shoulder_index][:2] - x2, y2 = candidate[right_elbow_index][:2] - x3, y3 = candidate[right_wrist_index][:2] - hands.append([x1, y1, x2, y2, x3, y3, False]) - - for x1, y1, x2, y2, x3, y3, is_left in hands: - # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox - # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]); - # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]); - # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow); - # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder); - # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder); - x = x3 + ratioWristElbow * (x3 - x2) - y = y3 + ratioWristElbow * (y3 - y2) - distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2) - distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2) - width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder) - # x-y refers to the center --> offset to topLeft point - # handRectangle.x -= handRectangle.width / 2.f; - # handRectangle.y -= handRectangle.height / 2.f; - x -= width / 2 - y -= width / 2 # width = height - # overflow the image - if x < 0: - x = 0 - if y < 0: - y = 0 - width1 = width - width2 = width - if x + width > image_width: - width1 = image_width - x - if y + width > image_height: - width2 = image_height - y - width = min(width1, width2) - # the max hand box value is 20 pixels - if width >= 20: - detect_result.append([int(x), int(y), int(width), is_left]) - - """ - 
return value: [[x, y, w, True if left hand else False]]. - width=height since the network require squared input. - x, y is the coordinate of top left - """ - return detect_result - - -# get max index of 2d array -def npmax(array): - arrayindex = array.argmax(1) - arrayvalue = array.max(1) - i = arrayvalue.argmax() - j = arrayindex[i] - return i, j diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/__init__.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/__init__.py deleted file mode 100644 index 1cc5512cdad6..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/__init__.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import cv2 -import numpy as np -import paddle -import paddlehub as hub - -from . import util -from .det_keypoint_unite_infer import PPDetPose - - -def keypoint_to_openpose_kpts(coco_keypoints_list): - # coco keypoints: [x1,y1,v1,...,xk,yk,vk] (k=17) - # ['Nose', Leye', 'Reye', 'Lear', 'Rear', 'Lsho', 'Rsho', 'Lelb', - # 'Relb', 'Lwri', 'Rwri', 'Lhip', 'Rhip', 'Lkne', 'Rkne', 'Lank', 'Rank'] - # openpose keypoints: [y1,...,yk], [x1,...xk] (k=18, with Neck) - # ['Nose', *'Neck'*, 'Rsho', 'Relb', 'Rwri', 'Lsho', 'Lelb', 'Lwri','Rhip', - # 'Rkne', 'Rank', 'Lhip', 'Lkne', 'Lank', 'Reye', 'Leye', 'Rear', 'Lear'] - indices = [0, 6, 8, 10, 5, 7, 9, 12, 14, 16, 11, 13, 15, 2, 1, 4, 3] - openpose_kpts = [] - for i in indices: - openpose_kpts.append(coco_keypoints_list[i]) - - # Get 'Neck' keypoint by interpolating between 'Lsho' and 'Rsho' keypoints - l_shoulder_index = 5 - r_shoulder_index = 6 - l_shoulder_keypoint = coco_keypoints_list[l_shoulder_index] - r_shoulder_keypoint = coco_keypoints_list[r_shoulder_index] - - neck_keypoint_y = int((l_shoulder_keypoint[1] + r_shoulder_keypoint[1]) / 2.0) - neck_keypoint_x = int((l_shoulder_keypoint[0] + r_shoulder_keypoint[0]) / 2.0) - neck_keypoint = [neck_keypoint_x, neck_keypoint_y, min(l_shoulder_keypoint[2], r_shoulder_keypoint[2])] - open_pose_neck_index = 1 - openpose_kpts.insert(open_pose_neck_index, neck_keypoint) - - return openpose_kpts - - -class PPDetDetector: - def __init__(self): - self.body_estimation = hub.Module(name="openpose_body_estimation") - self.hand_estimation = hub.Module(name="openpose_hands_estimation") - self.ppdetpose = PPDetPose() - - def __call__(self, oriImg, detect_resolution=512, hand=False): - with paddle.no_grad(): - img_scalarfactor = detect_resolution / min(oriImg.shape[:2]) - result = self.ppdetpose_pred(oriImg) - result["candidate"] = result["candidate"] * img_scalarfactor - oriImg = cv2.resize(oriImg, (0, 0), fx=img_scalarfactor, fy=img_scalarfactor) - canvas = oriImg.copy() - canvas.fill(0) - canvas = self.body_estimation.draw_pose(canvas, result["candidate"], result["subset"]) - if hand: - hands_list = util.hand_detect(result["candidate"], result["subset"], oriImg) - all_hand_peaks = [] - for x, y, w, is_left in hands_list: - scale_search = [x * 
img_scalarfactor for x in [0.5, 1.0, 1.5, 2.0]] - peaks = self.hand_estimation.hand_estimation( - oriImg[y : y + w, x : x + w, ::-1], scale_search=scale_search - ) - peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x) - peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y) - all_hand_peaks.append(peaks) - canvas = util.draw_handpose(canvas, all_hand_peaks) - - return canvas, dict(candidate=result["candidate"].tolist(), subset=result["subset"].tolist()) - - def ppdetpose_pred(self, image, kpt_threshold=0.3): - poseres = self.ppdetpose.ppdet_hrnet_infer(image) - keypoints = poseres["keypoint"][0] - num_kpts = len(keypoints) - subset = np.ones((num_kpts, 20)) * -1 - candidate = np.zeros((0, 4)) - posnum = 0 - for kptid, keypoint in enumerate(keypoints): - openpose_kpts = keypoint_to_openpose_kpts(keypoint) - for idx, item in enumerate(openpose_kpts): - if item[2] > kpt_threshold: - subset[kptid][idx] = posnum - kpt = np.array( - item - + [ - posnum, - ] - ) - candidate = np.vstack((candidate, kpt)) - posnum += 1 - return {"candidate": candidate, "subset": subset} diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/benchmark_utils.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/benchmark_utils.py deleted file mode 100644 index f5a83c60ca68..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/benchmark_utils.py +++ /dev/null @@ -1,273 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -from pathlib import Path - -import paddle -import paddle.inference as paddle_infer - -CUR_DIR = os.path.dirname(os.path.abspath(__file__)) -LOG_PATH_ROOT = f"{CUR_DIR}/../../output" - - -class PaddleInferBenchmark(object): - def __init__( - self, - config, - model_info: dict = {}, - data_info: dict = {}, - perf_info: dict = {}, - resource_info: dict = {}, - **kwargs - ): - """ - Construct PaddleInferBenchmark Class to format logs. 
- args: - config(paddle.inference.Config): paddle inference config - model_info(dict): basic model info - {'model_name': 'resnet50' - 'precision': 'fp32'} - data_info(dict): input data info - {'batch_size': 1 - 'shape': '3,224,224' - 'data_num': 1000} - perf_info(dict): performance result - {'preprocess_time_s': 1.0 - 'inference_time_s': 2.0 - 'postprocess_time_s': 1.0 - 'total_time_s': 4.0} - resource_info(dict): - cpu and gpu resources - {'cpu_rss': 100 - 'gpu_rss': 100 - 'gpu_util': 60} - """ - # PaddleInferBenchmark Log Version - self.log_version = "1.0.3" - - # Paddle Version - self.paddle_version = paddle.__version__ - self.paddle_commit = paddle.__git_commit__ - paddle_infer_info = paddle_infer.get_version() - self.paddle_branch = paddle_infer_info.strip().split(": ")[-1] - - # model info - self.model_info = model_info - - # data info - self.data_info = data_info - - # perf info - self.perf_info = perf_info - - try: - # required value - self.model_name = model_info["model_name"] - self.precision = model_info["precision"] - - self.batch_size = data_info["batch_size"] - self.shape = data_info["shape"] - self.data_num = data_info["data_num"] - - self.inference_time_s = round(perf_info["inference_time_s"], 4) - except: - self.print_help() - raise ValueError("Set argument wrong, please check input argument and its type") - - self.preprocess_time_s = perf_info.get("preprocess_time_s", 0) - self.postprocess_time_s = perf_info.get("postprocess_time_s", 0) - self.with_tracker = True if "tracking_time_s" in perf_info else False - self.tracking_time_s = perf_info.get("tracking_time_s", 0) - self.total_time_s = perf_info.get("total_time_s", 0) - - self.inference_time_s_90 = perf_info.get("inference_time_s_90", "") - self.inference_time_s_99 = perf_info.get("inference_time_s_99", "") - self.succ_rate = perf_info.get("succ_rate", "") - self.qps = perf_info.get("qps", "") - - # conf info - self.config_status = self.parse_config(config) - - # mem info - if isinstance(resource_info, dict): - self.cpu_rss_mb = int(resource_info.get("cpu_rss_mb", 0)) - self.cpu_vms_mb = int(resource_info.get("cpu_vms_mb", 0)) - self.cpu_shared_mb = int(resource_info.get("cpu_shared_mb", 0)) - self.cpu_dirty_mb = int(resource_info.get("cpu_dirty_mb", 0)) - self.cpu_util = round(resource_info.get("cpu_util", 0), 2) - - self.gpu_rss_mb = int(resource_info.get("gpu_rss_mb", 0)) - self.gpu_util = round(resource_info.get("gpu_util", 0), 2) - self.gpu_mem_util = round(resource_info.get("gpu_mem_util", 0), 2) - else: - self.cpu_rss_mb = 0 - self.cpu_vms_mb = 0 - self.cpu_shared_mb = 0 - self.cpu_dirty_mb = 0 - self.cpu_util = 0 - - self.gpu_rss_mb = 0 - self.gpu_util = 0 - self.gpu_mem_util = 0 - - # init benchmark logger - self.benchmark_logger() - - def benchmark_logger(self): - """ - benchmark logger - """ - # remove other logging handler - for handler in logging.root.handlers[:]: - logging.root.removeHandler(handler) - - # Init logger - FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - log_output = f"{LOG_PATH_ROOT}/{self.model_name}.log" - Path(f"{LOG_PATH_ROOT}").mkdir(parents=True, exist_ok=True) - logging.basicConfig( - level=logging.INFO, - format=FORMAT, - handlers=[ - logging.FileHandler(filename=log_output, mode="w"), - logging.StreamHandler(), - ], - ) - self.logger = logging.getLogger(__name__) - self.logger.info(f"Paddle Inference benchmark log will be saved to {log_output}") - - def parse_config(self, config) -> dict: - """ - parse paddle predictor config - args: - 
config(paddle.inference.Config): paddle inference config - return: - config_status(dict): dict style config info - """ - if isinstance(config, paddle_infer.Config): - config_status = {} - config_status["runtime_device"] = "gpu" if config.use_gpu() else "cpu" - config_status["ir_optim"] = config.ir_optim() - config_status["enable_tensorrt"] = config.tensorrt_engine_enabled() - config_status["precision"] = self.precision - config_status["enable_mkldnn"] = config.mkldnn_enabled() - config_status["cpu_math_library_num_threads"] = config.cpu_math_library_num_threads() - elif isinstance(config, dict): - config_status["runtime_device"] = config.get("runtime_device", "") - config_status["ir_optim"] = config.get("ir_optim", "") - config_status["enable_tensorrt"] = config.get("enable_tensorrt", "") - config_status["precision"] = config.get("precision", "") - config_status["enable_mkldnn"] = config.get("enable_mkldnn", "") - config_status["cpu_math_library_num_threads"] = config.get("cpu_math_library_num_threads", "") - else: - self.print_help() - raise ValueError("Set argument config wrong, please check input argument and its type") - return config_status - - def report(self, identifier=None): - """ - print log report - args: - identifier(string): identify log - """ - if identifier: - identifier = f"[{identifier}]" - else: - identifier = "" - - self.logger.info("\n") - self.logger.info("---------------------- Paddle info ----------------------") - self.logger.info(f"{identifier} paddle_version: {self.paddle_version}") - self.logger.info(f"{identifier} paddle_commit: {self.paddle_commit}") - self.logger.info(f"{identifier} paddle_branch: {self.paddle_branch}") - self.logger.info(f"{identifier} log_api_version: {self.log_version}") - self.logger.info("----------------------- Conf info -----------------------") - self.logger.info(f"{identifier} runtime_device: {self.config_status['runtime_device']}") - self.logger.info(f"{identifier} ir_optim: {self.config_status['ir_optim']}") - self.logger.info(f"{identifier} enable_memory_optim: {True}") - self.logger.info(f"{identifier} enable_tensorrt: {self.config_status['enable_tensorrt']}") - self.logger.info(f"{identifier} enable_mkldnn: {self.config_status['enable_mkldnn']}") - self.logger.info( - f"{identifier} cpu_math_library_num_threads: {self.config_status['cpu_math_library_num_threads']}" - ) - self.logger.info("----------------------- Model info ----------------------") - self.logger.info(f"{identifier} model_name: {self.model_name}") - self.logger.info(f"{identifier} precision: {self.precision}") - self.logger.info("----------------------- Data info -----------------------") - self.logger.info(f"{identifier} batch_size: {self.batch_size}") - self.logger.info(f"{identifier} input_shape: {self.shape}") - self.logger.info(f"{identifier} data_num: {self.data_num}") - self.logger.info("----------------------- Perf info -----------------------") - self.logger.info( - f"{identifier} cpu_rss(MB): {self.cpu_rss_mb}, cpu_vms: {self.cpu_vms_mb}, cpu_shared_mb: {self.cpu_shared_mb}, cpu_dirty_mb: {self.cpu_dirty_mb}, cpu_util: {self.cpu_util}%" - ) - self.logger.info( - f"{identifier} gpu_rss(MB): {self.gpu_rss_mb}, gpu_util: {self.gpu_util}%, gpu_mem_util: {self.gpu_mem_util}%" - ) - self.logger.info(f"{identifier} total time spent(s): {self.total_time_s}") - - if self.with_tracker: - self.logger.info( - f"{identifier} preprocess_time(ms): {round(self.preprocess_time_s*1000, 1)}, " - f"inference_time(ms): {round(self.inference_time_s*1000, 1)}, " - 
f"postprocess_time(ms): {round(self.postprocess_time_s*1000, 1)}, " - f"tracking_time(ms): {round(self.tracking_time_s*1000, 1)}" - ) - else: - self.logger.info( - f"{identifier} preprocess_time(ms): {round(self.preprocess_time_s*1000, 1)}, " - f"inference_time(ms): {round(self.inference_time_s*1000, 1)}, " - f"postprocess_time(ms): {round(self.postprocess_time_s*1000, 1)}" - ) - if self.inference_time_s_90: - self.looger.info( - f"{identifier} 90%_cost: {self.inference_time_s_90}, 99%_cost: {self.inference_time_s_99}, succ_rate: {self.succ_rate}" - ) - if self.qps: - self.logger.info(f"{identifier} QPS: {self.qps}") - - def print_help(self): - """ - print function help - """ - print( - """Usage: - ==== Print inference benchmark logs. ==== - config = paddle.inference.Config() - model_info = {'model_name': 'resnet50' - 'precision': 'fp32'} - data_info = {'batch_size': 1 - 'shape': '3,224,224' - 'data_num': 1000} - perf_info = {'preprocess_time_s': 1.0 - 'inference_time_s': 2.0 - 'postprocess_time_s': 1.0 - 'total_time_s': 4.0} - resource_info = {'cpu_rss_mb': 100 - 'gpu_rss_mb': 100 - 'gpu_util': 60} - log = PaddleInferBenchmark(config, model_info, data_info, perf_info, resource_info) - log('Test') - """ - ) - - def __call__(self, identifier=None): - """ - __call__ - args: - identifier(string): identify log - """ - self.report(identifier) diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_infer.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_infer.py deleted file mode 100644 index 7f743e09667a..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_infer.py +++ /dev/null @@ -1,448 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import math -import os - -import cv2 -import numpy as np -import paddle -import yaml - -from paddlenlp.utils.downloader import get_path_from_url_with_filelock - -from .det_keypoint_unite_utils import argsparser -from .infer import ( # noqa F401 - Detector, - DetectorPicoDet, - PredictConfig, - bench_log, - get_test_images, - print_arguments, -) -from .keypoint_infer import KeyPointDetector -from .keypoint_postprocess import translate_to_ori_images -from .preprocess import decode_image -from .utils import get_current_memory_mb -from .visualize import visualize_pose - -KEYPOINT_SUPPORT_MODELS = {"HigherHRNet": "keypoint_bottomup", "HRNet": "keypoint_topdown"} - - -def predict_with_given_det(image, det_res, keypoint_detector, keypoint_batch_size, run_benchmark): - keypoint_res = {} - - rec_images, records, det_rects = keypoint_detector.get_person_from_rect(image, det_res) - - if len(det_rects) == 0: - keypoint_res["keypoint"] = [[], []] - return keypoint_res - - keypoint_vector = [] - score_vector = [] - - rect_vector = det_rects - keypoint_results = keypoint_detector.predict_image(rec_images, run_benchmark, repeats=10, visual=False) - keypoint_vector, score_vector = translate_to_ori_images(keypoint_results, np.array(records)) - keypoint_res["keypoint"] = ( - [keypoint_vector.tolist(), score_vector.tolist()] if len(keypoint_vector) > 0 else [[], []] - ) - keypoint_res["bbox"] = rect_vector - return keypoint_res - - -def topdown_unite_predict(detector, topdown_keypoint_detector, image_list, keypoint_batch_size=1, save_res=False): - det_timer = detector.get_timer() - store_res = [] - for i, img_file in enumerate(image_list): - # Decode image in advance in det + pose prediction - det_timer.preprocess_time_s.start() - image, _ = decode_image(img_file, {}) - det_timer.preprocess_time_s.end() - - if FLAGS.run_benchmark: - results = detector.predict_image([image], run_benchmark=True, repeats=10) - - cm, gm, gu = get_current_memory_mb() - detector.cpu_mem += cm - detector.gpu_mem += gm - detector.gpu_util += gu - else: - results = detector.predict_image([image], visual=False) - results = detector.filter_box(results, FLAGS.det_threshold) - if results["boxes_num"] > 0: - keypoint_res = predict_with_given_det( - image, results, topdown_keypoint_detector, keypoint_batch_size, FLAGS.run_benchmark - ) - - if save_res: - save_name = img_file if isinstance(img_file, str) else i - store_res.append( - [save_name, keypoint_res["bbox"], [keypoint_res["keypoint"][0], keypoint_res["keypoint"][1]]] - ) - else: - results["keypoint"] = [[], []] - keypoint_res = results - if FLAGS.run_benchmark: - cm, gm, gu = get_current_memory_mb() - topdown_keypoint_detector.cpu_mem += cm - topdown_keypoint_detector.gpu_mem += gm - topdown_keypoint_detector.gpu_util += gu - else: - if not os.path.exists(FLAGS.output_dir): - os.makedirs(FLAGS.output_dir) - visualize_pose(img_file, keypoint_res, visual_thresh=FLAGS.keypoint_threshold, save_dir=FLAGS.output_dir) - if save_res: - """ - 1) store_res: a list of image_data - 2) image_data: [imageid, rects, [keypoints, scores]] - 3) rects: list of rect [xmin, ymin, xmax, ymax] - 4) keypoints: 17(joint numbers)*[x, y, conf], total 51 data in list - 5) scores: mean of all joint conf - """ - with open("det_keypoint_unite_image_results.json", "w") as wf: - json.dump(store_res, wf, indent=4) - - -def topdown_unite_predict_singleimage( - detector, topdown_keypoint_detector, image, keypoint_batch_size=8, det_threshold=0.25 -): - - results = detector.predict_image([image], 
visual=False) - results = detector.filter_box(results, det_threshold) - if results["boxes_num"] > 0: - keypoint_res = predict_with_given_det(image, results, topdown_keypoint_detector, keypoint_batch_size, False) - - else: - results["keypoint"] = [[], []] - keypoint_res = results - return keypoint_res - - -def topdown_unite_predict_video(detector, topdown_keypoint_detector, camera_id, keypoint_batch_size=1, save_res=False): - video_name = "output.mp4" - if camera_id != -1: - capture = cv2.VideoCapture(camera_id) - else: - capture = cv2.VideoCapture(FLAGS.video_file) - video_name = os.path.split(FLAGS.video_file)[-1] - # Get Video info : resolution, fps, frame count - width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)) - height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)) - fps = int(capture.get(cv2.CAP_PROP_FPS)) - frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) - print("fps: %d, frame_count: %d" % (fps, frame_count)) - - if not os.path.exists(FLAGS.output_dir): - os.makedirs(FLAGS.output_dir) - out_path = os.path.join(FLAGS.output_dir, video_name) - fourcc = cv2.VideoWriter_fourcc(*"mp4v") - writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) - index = 0 - store_res = [] - keypoint_smoothing = KeypointSmoothing(width, height, filter_type=FLAGS.filter_type, beta=0.05) - - while 1: - ret, frame = capture.read() - if not ret: - break - index += 1 - print("detect frame: %d" % (index)) - - frame2 = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - - results = detector.predict_image([frame2], visual=False) - results = detector.filter_box(results, FLAGS.det_threshold) - if results["boxes_num"] == 0: - writer.write(frame) - continue - - keypoint_res = predict_with_given_det( - frame2, results, topdown_keypoint_detector, keypoint_batch_size, FLAGS.run_benchmark - ) - - if FLAGS.smooth and len(keypoint_res["keypoint"][0]) == 1: - current_keypoints = np.array(keypoint_res["keypoint"][0][0]) - smooth_keypoints = keypoint_smoothing.smooth_process(current_keypoints) - - keypoint_res["keypoint"][0][0] = smooth_keypoints.tolist() - - im = visualize_pose(frame, keypoint_res, visual_thresh=FLAGS.keypoint_threshold, returnimg=True) - - if save_res: - store_res.append([index, keypoint_res["bbox"], [keypoint_res["keypoint"][0], keypoint_res["keypoint"][1]]]) - - writer.write(im) - if camera_id != -1: - cv2.imshow("Mask Detection", im) - if cv2.waitKey(1) & 0xFF == ord("q"): - break - writer.release() - print("output_video saved to: {}".format(out_path)) - if save_res: - """ - 1) store_res: a list of frame_data - 2) frame_data: [frameid, rects, [keypoints, scores]] - 3) rects: list of rect [xmin, ymin, xmax, ymax] - 4) keypoints: 17(joint numbers)*[x, y, conf], total 51 data in list - 5) scores: mean of all joint conf - """ - with open("det_keypoint_unite_video_results.json", "w") as wf: - json.dump(store_res, wf, indent=4) - - -class KeypointSmoothing(object): - # The following code are modified from: - # https://github.com/jaantollander/OneEuroFilter - - def __init__(self, width, height, filter_type, alpha=0.5, fc_d=0.1, fc_min=0.1, beta=0.1, thres_mult=0.3): - super(KeypointSmoothing, self).__init__() - self.image_width = width - self.image_height = height - self.threshold = ( - np.array( - [ - 0.005, - 0.005, - 0.005, - 0.005, - 0.005, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - ] - ) - * thres_mult - ) - self.filter_type = filter_type - self.alpha = alpha - self.dx_prev_hat = None - self.x_prev_hat = None - self.fc_d = fc_d - self.fc_min = 
fc_min - self.beta = beta - - if self.filter_type == "OneEuro": - self.smooth_func = self.one_euro_filter - elif self.filter_type == "EMA": - self.smooth_func = self.ema_filter - else: - raise ValueError("filter type must be one_euro or ema") - - def smooth_process(self, current_keypoints): - if self.x_prev_hat is None: - self.x_prev_hat = current_keypoints[:, :2] - self.dx_prev_hat = np.zeros(current_keypoints[:, :2].shape) - return current_keypoints - else: - result = current_keypoints - num_keypoints = len(current_keypoints) - for i in range(num_keypoints): - result[i, :2] = self.smooth(current_keypoints[i, :2], self.threshold[i], i) - return result - - def smooth(self, current_keypoint, threshold, index): - distance = np.sqrt( - np.square((current_keypoint[0] - self.x_prev_hat[index][0]) / self.image_width) - + np.square((current_keypoint[1] - self.x_prev_hat[index][1]) / self.image_height) - ) - if distance < threshold: - result = self.x_prev_hat[index] - else: - result = self.smooth_func(current_keypoint, self.x_prev_hat[index], index) - - return result - - def one_euro_filter(self, x_cur, x_pre, index): - te = 1 - self.alpha = self.smoothing_factor(te, self.fc_d) - dx_cur = (x_cur - x_pre) / te - dx_cur_hat = self.exponential_smoothing(dx_cur, self.dx_prev_hat[index]) - - fc = self.fc_min + self.beta * np.abs(dx_cur_hat) - self.alpha = self.smoothing_factor(te, fc) - x_cur_hat = self.exponential_smoothing(x_cur, x_pre) - self.dx_prev_hat[index] = dx_cur_hat - self.x_prev_hat[index] = x_cur_hat - return x_cur_hat - - def ema_filter(self, x_cur, x_pre, index): - x_cur_hat = self.exponential_smoothing(x_cur, x_pre) - self.x_prev_hat[index] = x_cur_hat - return x_cur_hat - - def smoothing_factor(self, te, fc): - r = 2 * math.pi * fc * te - return r / (r + 1) - - def exponential_smoothing(self, x_cur, x_pre, index=0): - return self.alpha * x_cur + (1 - self.alpha) * x_pre - - -det_model_dir = "annotator/ppdet_hrnet/models/picodet_v2_s_320_pedestrian/" -keypoint_model_dir = "annotator/ppdet_hrnet/models/dark_hrnet_w32_256x192/" -keypoint_batch_size = 8 -use_dark = True -device = "gpu" -run_mode = "paddle" -trt_min_shape = 1 -trt_max_shape = 1920 -trt_opt_shape = 640 -trt_calib_mode = False -cpu_threads = 4 -enable_mkldnn = False -det_threshold = 0.4 - -if not os.path.exists(det_model_dir): - detmodel_url = ( - "https://bj.bcebos.com/v1/paddledet/models/keypoint/tinypose_enhance/picodet_s_320_lcnet_pedestrian.zip" - ) - get_path_from_url_with_filelock(detmodel_url, root_dir="annotator/ppdet_hrnet/models/") -if not os.path.exists(keypoint_model_dir): - kptmodel_url = "https://bj.bcebos.com/v1/paddledet/models/pipeline/dark_hrnet_w32_256x192.zip" - get_path_from_url_with_filelock(kptmodel_url, root_dir="annotator/ppdet_hrnet/models/") - - -class PPDetPose(object): - def __init__(self) -> None: - deploy_file = os.path.join(det_model_dir, "infer_cfg.yml") - with open(deploy_file) as f: - yml_conf = yaml.safe_load(f) - arch = yml_conf["arch"] - detector_func = "Detector" - if arch == "PicoDet": - detector_func = "DetectorPicoDet" - - self.detector = eval(detector_func)( - det_model_dir, - device=device, - run_mode=run_mode, - trt_min_shape=trt_min_shape, - trt_max_shape=trt_max_shape, - trt_opt_shape=trt_opt_shape, - trt_calib_mode=trt_calib_mode, - cpu_threads=cpu_threads, - enable_mkldnn=enable_mkldnn, - threshold=det_threshold, - ) - - self.topdown_keypoint_detector = KeyPointDetector( - keypoint_model_dir, - device=device, - run_mode=run_mode, - batch_size=keypoint_batch_size, - 
trt_min_shape=trt_min_shape, - trt_max_shape=trt_max_shape, - trt_opt_shape=trt_opt_shape, - trt_calib_mode=trt_calib_mode, - cpu_threads=cpu_threads, - enable_mkldnn=enable_mkldnn, - use_dark=use_dark, - ) - keypoint_arch = self.topdown_keypoint_detector.pred_config.arch - assert ( - KEYPOINT_SUPPORT_MODELS[keypoint_arch] == "keypoint_topdown" - ), "Detection-Keypoint unite inference only supports topdown models." - - def ppdet_hrnet_infer(self, image): - # predict from image - return topdown_unite_predict_singleimage( - self.detector, self.topdown_keypoint_detector, image, keypoint_batch_size, det_threshold - ) - - -def main(): - deploy_file = os.path.join(FLAGS.det_model_dir, "infer_cfg.yml") - with open(deploy_file) as f: - yml_conf = yaml.safe_load(f) - arch = yml_conf["arch"] - detector_func = "Detector" - if arch == "PicoDet": - detector_func = "DetectorPicoDet" - - detector = eval(detector_func)( - FLAGS.det_model_dir, - device=FLAGS.device, - run_mode=FLAGS.run_mode, - trt_min_shape=FLAGS.trt_min_shape, - trt_max_shape=FLAGS.trt_max_shape, - trt_opt_shape=FLAGS.trt_opt_shape, - trt_calib_mode=FLAGS.trt_calib_mode, - cpu_threads=FLAGS.cpu_threads, - enable_mkldnn=FLAGS.enable_mkldnn, - threshold=FLAGS.det_threshold, - ) - - topdown_keypoint_detector = KeyPointDetector( - FLAGS.keypoint_model_dir, - device=FLAGS.device, - run_mode=FLAGS.run_mode, - batch_size=FLAGS.keypoint_batch_size, - trt_min_shape=FLAGS.trt_min_shape, - trt_max_shape=FLAGS.trt_max_shape, - trt_opt_shape=FLAGS.trt_opt_shape, - trt_calib_mode=FLAGS.trt_calib_mode, - cpu_threads=FLAGS.cpu_threads, - enable_mkldnn=FLAGS.enable_mkldnn, - use_dark=FLAGS.use_dark, - ) - keypoint_arch = topdown_keypoint_detector.pred_config.arch - assert ( - KEYPOINT_SUPPORT_MODELS[keypoint_arch] == "keypoint_topdown" - ), "Detection-Keypoint unite inference only supports topdown models." 
- - # predict from video file or camera video stream - if FLAGS.video_file is not None or FLAGS.camera_id != -1: - topdown_unite_predict_video( - detector, topdown_keypoint_detector, FLAGS.camera_id, FLAGS.keypoint_batch_size, FLAGS.save_res - ) - else: - # predict from image - img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file) - topdown_unite_predict(detector, topdown_keypoint_detector, img_list, FLAGS.keypoint_batch_size, FLAGS.save_res) - if not FLAGS.run_benchmark: - detector.det_times.info(average=True) - topdown_keypoint_detector.det_times.info(average=True) - else: - mode = FLAGS.run_mode - det_model_dir = FLAGS.det_model_dir - det_model_info = {"model_name": det_model_dir.strip("/").split("/")[-1], "precision": mode.split("_")[-1]} - bench_log(detector, img_list, det_model_info, name="Det") - keypoint_model_dir = FLAGS.keypoint_model_dir - keypoint_model_info = { - "model_name": keypoint_model_dir.strip("/").split("/")[-1], - "precision": mode.split("_")[-1], - } - bench_log(topdown_keypoint_detector, img_list, keypoint_model_info, FLAGS.keypoint_batch_size, "KeyPoint") - - -if __name__ == "__main__": - paddle.enable_static() - parser = argsparser() - FLAGS = parser.parse_args() - print_arguments(FLAGS) - FLAGS.device = FLAGS.device.upper() - assert FLAGS.device in ["CPU", "GPU", "XPU"], "device should be CPU, GPU or XPU" - - main() diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_utils.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_utils.py deleted file mode 100644 index 14d0955c971f..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_utils.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import ast - - -def argsparser(): - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--det_model_dir", - type=str, - default=None, - help=( - "Directory include:'model.pdiparams', 'model.pdmodel', " - "'infer_cfg.yml', created by tools/export_model.py." - ), - required=True, - ) - parser.add_argument( - "--keypoint_model_dir", - type=str, - default=None, - help=( - "Directory include:'model.pdiparams', 'model.pdmodel', " - "'infer_cfg.yml', created by tools/export_model.py." - ), - required=True, - ) - parser.add_argument("--image_file", type=str, default=None, help="Path of image file.") - parser.add_argument( - "--image_dir", type=str, default=None, help="Dir of image file, `image_file` has a higher priority." - ) - parser.add_argument( - "--keypoint_batch_size", - type=int, - default=8, - help=( - "batch_size for keypoint inference. In detection-keypoint unit" - "inference, the batch size in detection is 1. Then collate det " - "result in batch for keypoint inference." 
- ), - ) - parser.add_argument( - "--video_file", - type=str, - default=None, - help="Path of video file, `video_file` or `camera_id` has a highest priority.", - ) - parser.add_argument("--camera_id", type=int, default=-1, help="device id of camera to predict.") - parser.add_argument("--det_threshold", type=float, default=0.5, help="Threshold of score.") - parser.add_argument("--keypoint_threshold", type=float, default=0.5, help="Threshold of score.") - parser.add_argument("--output_dir", type=str, default="output", help="Directory of output visualization files.") - parser.add_argument( - "--run_mode", type=str, default="paddle", help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)" - ) - parser.add_argument( - "--device", - type=str, - default="cpu", - help="Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU.", - ) - parser.add_argument( - "--run_benchmark", - type=ast.literal_eval, - default=False, - help="Whether to predict a image_file repeatedly for benchmark", - ) - parser.add_argument("--enable_mkldnn", type=ast.literal_eval, default=False, help="Whether use mkldnn with CPU.") - parser.add_argument("--cpu_threads", type=int, default=1, help="Num of threads with CPU.") - parser.add_argument("--trt_min_shape", type=int, default=1, help="min_shape for TensorRT.") - parser.add_argument("--trt_max_shape", type=int, default=1280, help="max_shape for TensorRT.") - parser.add_argument("--trt_opt_shape", type=int, default=640, help="opt_shape for TensorRT.") - parser.add_argument( - "--trt_calib_mode", - type=bool, - default=False, - help="If the model is produced by TRT offline quantitative " "calibration, trt_calib_mode need to set True.", - ) - parser.add_argument( - "--use_dark", - type=ast.literal_eval, - default=True, - help="whether to use darkpose to get better keypoint position predict ", - ) - parser.add_argument( - "--save_res", - type=bool, - default=False, - help=( - "whether to save predict results to json file" - "1) store_res: a list of image_data" - "2) image_data: [imageid, rects, [keypoints, scores]]" - "3) rects: list of rect [xmin, ymin, xmax, ymax]" - "4) keypoints: 17(joint numbers)*[x, y, conf], total 51 data in list" - "5) scores: mean of all joint conf" - ), - ) - parser.add_argument( - "--smooth", - type=ast.literal_eval, - default=False, - help="smoothing keypoints for each frame, new incoming keypoints will be more stable.", - ) - parser.add_argument( - "--filter_type", - type=str, - default="OneEuro", - help="when set --smooth True, choose filter type you want to use, it can be [OneEuro] or [EMA].", - ) - return parser diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/infer.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/infer.py deleted file mode 100644 index 5e6a61af6c89..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/infer.py +++ /dev/null @@ -1,1052 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import glob -import json -import math -import os -from pathlib import Path - -import cv2 -import numpy as np -import paddle -import yaml -from paddle.inference import Config, create_predictor - -from .benchmark_utils import PaddleInferBenchmark -from .keypoint_preprocess import EvalAffine, TopDownEvalAffine, expand_crop # noqa F401 -from .picodet_postprocess import PicoDetPostProcess -from .preprocess import ( # noqa F401 - LetterBoxResize, - NormalizeImage, - Pad, - PadStride, - Permute, - Resize, - WarpAffine, - decode_image, - preprocess, -) -from .utils import ( - Timer, - argsparser, - coco_clsid2catid, - get_current_memory_mb, - multiclass_nms, -) -from .visualize import visualize_box_mask - -# Global dictionary -SUPPORT_MODELS = { - "YOLO", - "PPYOLOE", - "RCNN", - "SSD", - "Face", - "FCOS", - "SOLOv2", - "TTFNet", - "S2ANet", - "JDE", - "FairMOT", - "DeepSORT", - "GFL", - "PicoDet", - "CenterNet", - "TOOD", - "RetinaNet", - "StrongBaseline", - "STGCN", - "YOLOX", - "YOLOF", - "PPHGNet", - "PPLCNet", - "DETR", - "CenterTrack", -} - -TUNED_TRT_DYNAMIC_MODELS = {"DETR"} - - -def bench_log(detector, img_list, model_info, batch_size=1, name=None): - mems = { - "cpu_rss_mb": detector.cpu_mem / len(img_list), - "gpu_rss_mb": detector.gpu_mem / len(img_list), - "gpu_util": detector.gpu_util * 100 / len(img_list), - } - perf_info = detector.det_times.report(average=True) - data_info = {"batch_size": batch_size, "shape": "dynamic_shape", "data_num": perf_info["img_num"]} - log = PaddleInferBenchmark(detector.config, model_info, data_info, perf_info, mems) - log(name) - - -class Detector(object): - """ - Args: - pred_config (object): config of model, defined by `Config(model_dir)` - model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml - device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU - run_mode (str): mode of running(paddle/trt_fp32/trt_fp16) - batch_size (int): size of pre batch in inference - trt_min_shape (int): min shape for dynamic shape in trt - trt_max_shape (int): max shape for dynamic shape in trt - trt_opt_shape (int): opt shape for dynamic shape in trt - trt_calib_mode (bool): If the model is produced by TRT offline quantitative - calibration, trt_calib_mode need to set True - cpu_threads (int): cpu threads - enable_mkldnn (bool): whether to open MKLDNN - enable_mkldnn_bfloat16 (bool): whether to turn on mkldnn bfloat16 - output_dir (str): The path of output - threshold (float): The threshold of score for visualization - delete_shuffle_pass (bool): whether to remove shuffle_channel_detect_pass in TensorRT. - Used by action model. 
- """ - - def __init__( - self, - model_dir, - device="CPU", - run_mode="paddle", - batch_size=1, - trt_min_shape=1, - trt_max_shape=1280, - trt_opt_shape=640, - trt_calib_mode=False, - cpu_threads=1, - enable_mkldnn=False, - enable_mkldnn_bfloat16=False, - output_dir="output", - threshold=0.5, - delete_shuffle_pass=False, - ): - self.pred_config = self.set_config(model_dir) - self.predictor, self.config = load_predictor( - model_dir, - self.pred_config.arch, - run_mode=run_mode, - batch_size=batch_size, - min_subgraph_size=self.pred_config.min_subgraph_size, - device=device, - use_dynamic_shape=self.pred_config.use_dynamic_shape, - trt_min_shape=trt_min_shape, - trt_max_shape=trt_max_shape, - trt_opt_shape=trt_opt_shape, - trt_calib_mode=trt_calib_mode, - cpu_threads=cpu_threads, - enable_mkldnn=enable_mkldnn, - enable_mkldnn_bfloat16=enable_mkldnn_bfloat16, - delete_shuffle_pass=delete_shuffle_pass, - ) - self.det_times = Timer() - self.cpu_mem, self.gpu_mem, self.gpu_util = 0, 0, 0 - self.batch_size = batch_size - self.output_dir = output_dir - self.threshold = threshold - - def set_config(self, model_dir): - return PredictConfig(model_dir) - - def preprocess(self, image_list): - preprocess_ops = [] - for op_info in self.pred_config.preprocess_infos: - new_op_info = op_info.copy() - op_type = new_op_info.pop("type") - preprocess_ops.append(eval(op_type)(**new_op_info)) - - input_im_lst = [] - input_im_info_lst = [] - for im_path in image_list: - im, im_info = preprocess(im_path, preprocess_ops) - input_im_lst.append(im) - input_im_info_lst.append(im_info) - inputs = create_inputs(input_im_lst, input_im_info_lst) - input_names = self.predictor.get_input_names() - for i in range(len(input_names)): - input_tensor = self.predictor.get_input_handle(input_names[i]) - if input_names[i] == "x": - input_tensor.copy_from_cpu(inputs["image"]) - else: - input_tensor.copy_from_cpu(inputs[input_names[i]]) - - return inputs - - def postprocess(self, inputs, result): - # postprocess output of predictor - np_boxes_num = result["boxes_num"] - assert isinstance(np_boxes_num, np.ndarray), "`np_boxes_num` should be a `numpy.ndarray`" - - result = {k: v for k, v in result.items() if v is not None} - return result - - def filter_box(self, result, threshold): - np_boxes_num = result["boxes_num"] - boxes = result["boxes"] - start_idx = 0 - filter_boxes = [] - filter_num = [] - for i in range(len(np_boxes_num)): - boxes_num = np_boxes_num[i] - boxes_i = boxes[start_idx : start_idx + boxes_num, :] - idx = boxes_i[:, 1] > threshold - filter_boxes_i = boxes_i[idx, :] - filter_boxes.append(filter_boxes_i) - filter_num.append(filter_boxes_i.shape[0]) - start_idx += boxes_num - boxes = np.concatenate(filter_boxes) - filter_num = np.array(filter_num) - filter_res = {"boxes": boxes, "boxes_num": filter_num} - return filter_res - - def predict(self, repeats=1, run_benchmark=False): - """ - Args: - repeats (int): repeats number for prediction - Returns: - result (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box, - matix element:[class, score, x_min, y_min, x_max, y_max] - MaskRCNN's result include 'masks': np.ndarray: - shape: [N, im_h, im_w] - """ - # model prediction - np_boxes_num, np_boxes, np_masks = np.array([0]), None, None - - if run_benchmark: - for i in range(repeats): - self.predictor.run() - paddle.device.cuda.synchronize() - result = dict(boxes=np_boxes, masks=np_masks, boxes_num=np_boxes_num) - return result - - for i in range(repeats): - self.predictor.run() - output_names = 
self.predictor.get_output_names() - boxes_tensor = self.predictor.get_output_handle(output_names[0]) - np_boxes = boxes_tensor.copy_to_cpu() - if len(output_names) == 1: - # some exported model can not get tensor 'bbox_num' - np_boxes_num = np.array([len(np_boxes)]) - else: - boxes_num = self.predictor.get_output_handle(output_names[1]) - np_boxes_num = boxes_num.copy_to_cpu() - if self.pred_config.mask: - masks_tensor = self.predictor.get_output_handle(output_names[2]) - np_masks = masks_tensor.copy_to_cpu() - result = dict(boxes=np_boxes, masks=np_masks, boxes_num=np_boxes_num) - return result - - def merge_batch_result(self, batch_result): - if len(batch_result) == 1: - return batch_result[0] - res_key = batch_result[0].keys() - results = {k: [] for k in res_key} - for res in batch_result: - for k, v in res.items(): - results[k].append(v) - for k, v in results.items(): - if k not in ["masks", "segm"]: - results[k] = np.concatenate(v) - return results - - def get_timer(self): - return self.det_times - - def predict_image_slice( - self, - img_list, - slice_size=[640, 640], - overlap_ratio=[0.25, 0.25], - combine_method="nms", - match_threshold=0.6, - match_metric="ios", - run_benchmark=False, - repeats=1, - visual=True, - save_results=False, - ): - # slice infer only support bs=1 - results = [] - try: - import sahi - except Exception as e: - print( - "sahi not found, plaese install sahi. " - "for example: `pip install sahi`, see https://github.com/obss/sahi." - ) - raise e - num_classes = len(self.pred_config.labels) - for i in range(len(img_list)): - ori_image = img_list[i] - slice_image_result = sahi.slicing.slice_image( - image=ori_image, - slice_height=slice_size[0], - slice_width=slice_size[1], - overlap_height_ratio=overlap_ratio[0], - overlap_width_ratio=overlap_ratio[1], - ) - sub_img_num = len(slice_image_result) - merged_bboxs = [] - print("slice to {} sub_samples.", sub_img_num) - - batch_image_list = [slice_image_result.images[_ind] for _ind in range(sub_img_num)] - if run_benchmark: - # preprocess - inputs = self.preprocess(batch_image_list) # warmup - self.det_times.preprocess_time_s.start() - inputs = self.preprocess(batch_image_list) - self.det_times.preprocess_time_s.end() - - # model prediction - result = self.predict(repeats=50, run_benchmark=True) # warmup - self.det_times.inference_time_s.start() - result = self.predict(repeats=repeats, run_benchmark=True) - self.det_times.inference_time_s.end(repeats=repeats) - - # postprocess - self.postprocess(inputs, result) # warmup - self.det_times.postprocess_time_s.start() - result = self.postprocess(inputs, result) - self.det_times.postprocess_time_s.end() - self.det_times.img_num += 1 - - cm, gm, gu = get_current_memory_mb() - self.cpu_mem += cm - self.gpu_mem += gm - self.gpu_util += gu - else: - # preprocess - self.det_times.preprocess_time_s.start() - inputs = self.preprocess(batch_image_list) - self.det_times.preprocess_time_s.end() - - # model prediction - self.det_times.inference_time_s.start() - result = self.predict() - self.det_times.inference_time_s.end() - - # postprocess - self.det_times.postprocess_time_s.start() - result = self.postprocess(inputs, result) - self.det_times.postprocess_time_s.end() - self.det_times.img_num += 1 - - st, ed = 0, result["boxes_num"][0] # start_index, end_index - for _ind in range(sub_img_num): - boxes_num = result["boxes_num"][_ind] - ed = st + boxes_num - shift_amount = slice_image_result.starting_pixels[_ind] - result["boxes"][st:ed][:, 2:4] = result["boxes"][st:ed][:, 2:4] + 
shift_amount - result["boxes"][st:ed][:, 4:6] = result["boxes"][st:ed][:, 4:6] + shift_amount - merged_bboxs.append(result["boxes"][st:ed]) - st = ed - - merged_results = {"boxes": []} - if combine_method == "nms": - final_boxes = multiclass_nms(np.concatenate(merged_bboxs), num_classes, match_threshold, match_metric) - merged_results["boxes"] = np.concatenate(final_boxes) - elif combine_method == "concat": - merged_results["boxes"] = np.concatenate(merged_bboxs) - else: - raise ValueError("Now only support 'nms' or 'concat' to fuse detection results.") - merged_results["boxes_num"] = np.array([len(merged_results["boxes"])], dtype=np.int32) - - if visual: - visualize( - [ori_image], # should be list - merged_results, - self.pred_config.labels, - output_dir=self.output_dir, - threshold=self.threshold, - ) - - results.append(merged_results) - - results = self.merge_batch_result(results) - if save_results: - Path(self.output_dir).mkdir(exist_ok=True) - self.save_coco_results(img_list, results, use_coco_category=FLAGS.use_coco_category) - return results - - def predict_image(self, image_list, run_benchmark=False, repeats=1, visual=True, save_results=False): - batch_loop_cnt = math.ceil(float(len(image_list)) / self.batch_size) - results = [] - for i in range(batch_loop_cnt): - start_index = i * self.batch_size - end_index = min((i + 1) * self.batch_size, len(image_list)) - batch_image_list = image_list[start_index:end_index] - if run_benchmark: - # preprocess - inputs = self.preprocess(batch_image_list) # warmup - self.det_times.preprocess_time_s.start() - inputs = self.preprocess(batch_image_list) - self.det_times.preprocess_time_s.end() - - # model prediction - result = self.predict(repeats=50, run_benchmark=True) # warmup - self.det_times.inference_time_s.start() - result = self.predict(repeats=repeats, run_benchmark=True) - self.det_times.inference_time_s.end(repeats=repeats) - - # postprocess - self.postprocess(inputs, result) # warmup - self.det_times.postprocess_time_s.start() - result = self.postprocess(inputs, result) - self.det_times.postprocess_time_s.end() - self.det_times.img_num += len(batch_image_list) - - cm, gm, gu = get_current_memory_mb() - self.cpu_mem += cm - self.gpu_mem += gm - self.gpu_util += gu - else: - # preprocess - self.det_times.preprocess_time_s.start() - inputs = self.preprocess(batch_image_list) - self.det_times.preprocess_time_s.end() - - # model prediction - self.det_times.inference_time_s.start() - result = self.predict() - self.det_times.inference_time_s.end() - - # postprocess - self.det_times.postprocess_time_s.start() - result = self.postprocess(inputs, result) - self.det_times.postprocess_time_s.end() - self.det_times.img_num += len(batch_image_list) - - if visual: - visualize( - batch_image_list, - result, - self.pred_config.labels, - output_dir=self.output_dir, - threshold=self.threshold, - ) - results.append(result) - results = self.merge_batch_result(results) - if save_results: - Path(self.output_dir).mkdir(exist_ok=True) - self.save_coco_results(image_list, results, use_coco_category=FLAGS.use_coco_category) - return results - - def predict_video(self, video_file, camera_id): - video_out_name = "output.mp4" - if camera_id != -1: - capture = cv2.VideoCapture(camera_id) - else: - capture = cv2.VideoCapture(video_file) - video_out_name = os.path.split(video_file)[-1] - # Get Video info : resolution, fps, frame count - width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)) - height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)) - fps = 
int(capture.get(cv2.CAP_PROP_FPS)) - frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) - print("fps: %d, frame_count: %d" % (fps, frame_count)) - - if not os.path.exists(self.output_dir): - os.makedirs(self.output_dir) - out_path = os.path.join(self.output_dir, video_out_name) - fourcc = cv2.VideoWriter_fourcc(*"mp4v") - writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) - index = 1 - while 1: - ret, frame = capture.read() - if not ret: - break - print("detect frame: %d" % (index)) - index += 1 - results = self.predict_image([frame[:, :, ::-1]], visual=False) - - im = visualize_box_mask(frame, results, self.pred_config.labels, threshold=self.threshold) - im = np.array(im) - writer.write(im) - if camera_id != -1: - cv2.imshow("Mask Detection", im) - if cv2.waitKey(1) & 0xFF == ord("q"): - break - writer.release() - - def save_coco_results(self, image_list, results, use_coco_category=False): - bbox_results = [] - mask_results = [] - idx = 0 - print("Start saving coco json files...") - for i, box_num in enumerate(results["boxes_num"]): - file_name = os.path.split(image_list[i])[-1] - if use_coco_category: - img_id = int(os.path.splitext(file_name)[0]) - else: - img_id = i - - if "boxes" in results: - boxes = results["boxes"][idx : idx + box_num].tolist() - bbox_results.extend( - [ - { - "image_id": img_id, - "category_id": coco_clsid2catid[int(box[0])] if use_coco_category else int(box[0]), - "file_name": file_name, - "bbox": [box[2], box[3], box[4] - box[2], box[5] - box[3]], # xyxy -> xywh - "score": box[1], - } - for box in boxes - ] - ) - - if "masks" in results: - import pycocotools.mask as mask_util - - boxes = results["boxes"][idx : idx + box_num].tolist() - masks = results["masks"][i][:box_num].astype(np.uint8) - seg_res = [] - for box, mask in zip(boxes, masks): - rle = mask_util.encode(np.array(mask[:, :, None], dtype=np.uint8, order="F"))[0] - if "counts" in rle: - rle["counts"] = rle["counts"].decode("utf8") - seg_res.append( - { - "image_id": img_id, - "category_id": coco_clsid2catid[int(box[0])] if use_coco_category else int(box[0]), - "file_name": file_name, - "segmentation": rle, - "score": box[1], - } - ) - mask_results.extend(seg_res) - - idx += box_num - - if bbox_results: - bbox_file = os.path.join(self.output_dir, "bbox.json") - with open(bbox_file, "w") as f: - json.dump(bbox_results, f) - print(f"The bbox result is saved to {bbox_file}") - if mask_results: - mask_file = os.path.join(self.output_dir, "mask.json") - with open(mask_file, "w") as f: - json.dump(mask_results, f) - print(f"The mask result is saved to {mask_file}") - - -class DetectorSOLOv2(Detector): - """ - Args: - model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml - device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU - run_mode (str): mode of running(paddle/trt_fp32/trt_fp16) - batch_size (int): size of pre batch in inference - trt_min_shape (int): min shape for dynamic shape in trt - trt_max_shape (int): max shape for dynamic shape in trt - trt_opt_shape (int): opt shape for dynamic shape in trt - trt_calib_mode (bool): If the model is produced by TRT offline quantitative - calibration, trt_calib_mode need to set True - cpu_threads (int): cpu threads - enable_mkldnn (bool): whether to open MKLDNN - enable_mkldnn_bfloat16 (bool): Whether to turn on mkldnn bfloat16 - output_dir (str): The path of output - threshold (float): The threshold of score for visualization - - """ - - def __init__( - self, - model_dir, - 
device="CPU", - run_mode="paddle", - batch_size=1, - trt_min_shape=1, - trt_max_shape=1280, - trt_opt_shape=640, - trt_calib_mode=False, - cpu_threads=1, - enable_mkldnn=False, - enable_mkldnn_bfloat16=False, - output_dir="./", - threshold=0.5, - ): - super(DetectorSOLOv2, self).__init__( - model_dir=model_dir, - device=device, - run_mode=run_mode, - batch_size=batch_size, - trt_min_shape=trt_min_shape, - trt_max_shape=trt_max_shape, - trt_opt_shape=trt_opt_shape, - trt_calib_mode=trt_calib_mode, - cpu_threads=cpu_threads, - enable_mkldnn=enable_mkldnn, - enable_mkldnn_bfloat16=enable_mkldnn_bfloat16, - output_dir=output_dir, - threshold=threshold, - ) - - def predict(self, repeats=1, run_benchmark=False): - """ - Args: - repeats (int): repeat number for prediction - Returns: - result (dict): 'segm': np.ndarray,shape:[N, im_h, im_w] - 'cate_label': label of segm, shape:[N] - 'cate_score': confidence score of segm, shape:[N] - """ - np_segms, np_label, np_score, np_boxes_num = None, None, None, np.array([0]) - - if run_benchmark: - for i in range(repeats): - self.predictor.run() - paddle.device.cuda.synchronize() - result = dict(segm=np_segms, label=np_label, score=np_score, boxes_num=np_boxes_num) - return result - - for i in range(repeats): - self.predictor.run() - output_names = self.predictor.get_output_names() - np_boxes_num = self.predictor.get_output_handle(output_names[0]).copy_to_cpu() - np_label = self.predictor.get_output_handle(output_names[1]).copy_to_cpu() - np_score = self.predictor.get_output_handle(output_names[2]).copy_to_cpu() - np_segms = self.predictor.get_output_handle(output_names[3]).copy_to_cpu() - - result = dict(segm=np_segms, label=np_label, score=np_score, boxes_num=np_boxes_num) - return result - - -class DetectorPicoDet(Detector): - """ - Args: - model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml - device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU - run_mode (str): mode of running(paddle/trt_fp32/trt_fp16) - batch_size (int): size of pre batch in inference - trt_min_shape (int): min shape for dynamic shape in trt - trt_max_shape (int): max shape for dynamic shape in trt - trt_opt_shape (int): opt shape for dynamic shape in trt - trt_calib_mode (bool): If the model is produced by TRT offline quantitative - calibration, trt_calib_mode need to set True - cpu_threads (int): cpu threads - enable_mkldnn (bool): whether to turn on MKLDNN - enable_mkldnn_bfloat16 (bool): whether to turn on MKLDNN_BFLOAT16 - """ - - def __init__( - self, - model_dir, - device="CPU", - run_mode="paddle", - batch_size=1, - trt_min_shape=1, - trt_max_shape=1280, - trt_opt_shape=640, - trt_calib_mode=False, - cpu_threads=1, - enable_mkldnn=False, - enable_mkldnn_bfloat16=False, - output_dir="./", - threshold=0.5, - ): - super(DetectorPicoDet, self).__init__( - model_dir=model_dir, - device=device, - run_mode=run_mode, - batch_size=batch_size, - trt_min_shape=trt_min_shape, - trt_max_shape=trt_max_shape, - trt_opt_shape=trt_opt_shape, - trt_calib_mode=trt_calib_mode, - cpu_threads=cpu_threads, - enable_mkldnn=enable_mkldnn, - enable_mkldnn_bfloat16=enable_mkldnn_bfloat16, - output_dir=output_dir, - threshold=threshold, - ) - - def postprocess(self, inputs, result): - # postprocess output of predictor - np_score_list = result["boxes"] - np_boxes_list = result["boxes_num"] - postprocessor = PicoDetPostProcess( - inputs["image"].shape[2:], - inputs["im_shape"], - inputs["scale_factor"], - strides=self.pred_config.fpn_stride, - 
nms_threshold=self.pred_config.nms["nms_threshold"], - ) - np_boxes, np_boxes_num = postprocessor(np_score_list, np_boxes_list) - result = dict(boxes=np_boxes, boxes_num=np_boxes_num) - return result - - def predict(self, repeats=1, run_benchmark=False): - """ - Args: - repeats (int): repeat number for prediction - Returns: - result (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box, - matix element:[class, score, x_min, y_min, x_max, y_max] - """ - np_score_list, np_boxes_list = [], [] - - if run_benchmark: - for i in range(repeats): - self.predictor.run() - paddle.device.cuda.synchronize() - result = dict(boxes=np_score_list, boxes_num=np_boxes_list) - return result - - for i in range(repeats): - self.predictor.run() - np_score_list.clear() - np_boxes_list.clear() - output_names = self.predictor.get_output_names() - num_outs = int(len(output_names) / 2) - for out_idx in range(num_outs): - np_score_list.append(self.predictor.get_output_handle(output_names[out_idx]).copy_to_cpu()) - np_boxes_list.append(self.predictor.get_output_handle(output_names[out_idx + num_outs]).copy_to_cpu()) - result = dict(boxes=np_score_list, boxes_num=np_boxes_list) - return result - - -def create_inputs(imgs, im_info): - """generate input for different model type - Args: - imgs (list(numpy)): list of images (np.ndarray) - im_info (list(dict)): list of image info - Returns: - inputs (dict): input of model - """ - inputs = {} - - im_shape = [] - scale_factor = [] - if len(imgs) == 1: - inputs["image"] = np.array((imgs[0],)).astype("float32") - inputs["im_shape"] = np.array((im_info[0]["im_shape"],)).astype("float32") - inputs["scale_factor"] = np.array((im_info[0]["scale_factor"],)).astype("float32") - return inputs - - for e in im_info: - im_shape.append(np.array((e["im_shape"],)).astype("float32")) - scale_factor.append(np.array((e["scale_factor"],)).astype("float32")) - - inputs["im_shape"] = np.concatenate(im_shape, axis=0) - inputs["scale_factor"] = np.concatenate(scale_factor, axis=0) - - imgs_shape = [[e.shape[1], e.shape[2]] for e in imgs] - max_shape_h = max([e[0] for e in imgs_shape]) - max_shape_w = max([e[1] for e in imgs_shape]) - padding_imgs = [] - for img in imgs: - im_c, im_h, im_w = img.shape[:] - padding_im = np.zeros((im_c, max_shape_h, max_shape_w), dtype=np.float32) - padding_im[:, :im_h, :im_w] = img - padding_imgs.append(padding_im) - inputs["image"] = np.stack(padding_imgs, axis=0) - return inputs - - -class PredictConfig: - """set config of preprocess, postprocess and visualize - Args: - model_dir (str): root path of model.yml - """ - - def __init__(self, model_dir): - # parsing Yaml config for Preprocess - deploy_file = os.path.join(model_dir, "infer_cfg.yml") - with open(deploy_file) as f: - yml_conf = yaml.safe_load(f) - self.check_model(yml_conf) - self.arch = yml_conf["arch"] - self.preprocess_infos = yml_conf["Preprocess"] - self.min_subgraph_size = yml_conf["min_subgraph_size"] - self.labels = yml_conf["label_list"] - self.mask = False - self.use_dynamic_shape = yml_conf["use_dynamic_shape"] - if "mask" in yml_conf: - self.mask = yml_conf["mask"] - self.tracker = None - if "tracker" in yml_conf: - self.tracker = yml_conf["tracker"] - if "NMS" in yml_conf: - self.nms = yml_conf["NMS"] - if "fpn_stride" in yml_conf: - self.fpn_stride = yml_conf["fpn_stride"] - if self.arch == "RCNN" and yml_conf.get("export_onnx", False): - print("The RCNN export model is used for ONNX and it only supports batch_size = 1") - self.print_config() - - def check_model(self, yml_conf): - 
""" - Raises: - ValueError: loaded model not in supported model type - """ - for support_model in SUPPORT_MODELS: - if support_model in yml_conf["arch"]: - return True - raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf["arch"], SUPPORT_MODELS)) - - def print_config(self): - print("----------- Model Configuration -----------") - print("%s: %s" % ("Model Arch", self.arch)) - print("%s: " % ("Transform Order")) - for op_info in self.preprocess_infos: - print("--%s: %s" % ("transform op", op_info["type"])) - print("--------------------------------------------") - - -def load_predictor( - model_dir, - arch, - run_mode="paddle", - batch_size=1, - device="CPU", - min_subgraph_size=3, - use_dynamic_shape=False, - trt_min_shape=1, - trt_max_shape=1280, - trt_opt_shape=640, - trt_calib_mode=False, - cpu_threads=1, - enable_mkldnn=False, - enable_mkldnn_bfloat16=False, - delete_shuffle_pass=False, - tuned_trt_shape_file="shape_range_info.pbtxt", -): - """set AnalysisConfig, generate AnalysisPredictor - Args: - model_dir (str): root path of __model__ and __params__ - device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU - run_mode (str): mode of running(paddle/trt_fp32/trt_fp16/trt_int8) - use_dynamic_shape (bool): use dynamic shape or not - trt_min_shape (int): min shape for dynamic shape in trt - trt_max_shape (int): max shape for dynamic shape in trt - trt_opt_shape (int): opt shape for dynamic shape in trt - trt_calib_mode (bool): If the model is produced by TRT offline quantitative - calibration, trt_calib_mode need to set True - delete_shuffle_pass (bool): whether to remove shuffle_channel_detect_pass in TensorRT. - Used by action model. - Returns: - predictor (PaddlePredictor): AnalysisPredictor - Raises: - ValueError: predict by TensorRT need device == 'GPU'. 
- """ - if device != "GPU" and run_mode != "paddle": - raise ValueError( - "Predict by TensorRT mode: {}, expect device=='GPU', but device == {}".format(run_mode, device) - ) - infer_model = os.path.join(model_dir, "model.pdmodel") - infer_params = os.path.join(model_dir, "model.pdiparams") - if not os.path.exists(infer_model): - infer_model = os.path.join(model_dir, "inference.pdmodel") - infer_params = os.path.join(model_dir, "inference.pdiparams") - if not os.path.exists(infer_model): - raise ValueError("Cannot find any inference model in dir: {},".format(model_dir)) - config = Config(infer_model, infer_params) - if device == "GPU": - # initial GPU memory(M), device ID - config.enable_use_gpu(200, 0) - # optimize graph and fuse op - config.switch_ir_optim(True) - elif device == "XPU": - if config.lite_engine_enabled(): - config.enable_lite_engine() - config.enable_xpu(10 * 1024 * 1024) - elif device == "NPU": - if config.lite_engine_enabled(): - config.enable_lite_engine() - config.enable_custom_device("npu") - else: - config.disable_gpu() - config.set_cpu_math_library_num_threads(cpu_threads) - if enable_mkldnn: - try: - # cache 10 different shapes for mkldnn to avoid memory leak - config.set_mkldnn_cache_capacity(10) - config.enable_mkldnn() - if enable_mkldnn_bfloat16: - config.enable_mkldnn_bfloat16() - except: - print("The current environment does not support `mkldnn`, so disable mkldnn.") - pass - - precision_map = { - "trt_int8": Config.Precision.Int8, - "trt_fp32": Config.Precision.Float32, - "trt_fp16": Config.Precision.Half, - } - if run_mode in precision_map.keys(): - if arch in TUNED_TRT_DYNAMIC_MODELS: - config.collect_shape_range_info(tuned_trt_shape_file) - config.enable_tensorrt_engine( - workspace_size=(1 << 25) * batch_size, - max_batch_size=batch_size, - min_subgraph_size=min_subgraph_size, - precision_mode=precision_map[run_mode], - use_static=False, - use_calib_mode=trt_calib_mode, - ) - if arch in TUNED_TRT_DYNAMIC_MODELS: - config.enable_tuned_tensorrt_dynamic_shape(tuned_trt_shape_file, True) - - if use_dynamic_shape: - min_input_shape = {"image": [batch_size, 3, trt_min_shape, trt_min_shape], "scale_factor": [batch_size, 2]} - max_input_shape = {"image": [batch_size, 3, trt_max_shape, trt_max_shape], "scale_factor": [batch_size, 2]} - opt_input_shape = {"image": [batch_size, 3, trt_opt_shape, trt_opt_shape], "scale_factor": [batch_size, 2]} - config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape, opt_input_shape) - print("trt set dynamic shape done!") - - # disable print log when predict - config.disable_glog_info() - # enable shared memory - config.enable_memory_optim() - # disable feed, fetch OP, needed by zero_copy_run - config.switch_use_feed_fetch_ops(False) - if delete_shuffle_pass: - config.delete_pass("shuffle_channel_detect_pass") - predictor = create_predictor(config) - return predictor, config - - -def get_test_images(infer_dir, infer_img): - """ - Get image path list in TEST mode - """ - assert infer_img is not None or infer_dir is not None, "--image_file or --image_dir should be set" - assert infer_img is None or os.path.isfile(infer_img), "{} is not a file".format(infer_img) - assert infer_dir is None or os.path.isdir(infer_dir), "{} is not a directory".format(infer_dir) - - # infer_img has a higher priority - if infer_img and os.path.isfile(infer_img): - return [infer_img] - - images = set() - infer_dir = os.path.abspath(infer_dir) - assert os.path.isdir(infer_dir), "infer_dir {} is not a directory".format(infer_dir) - exts = ["jpg", 
"jpeg", "png", "bmp"] - exts += [ext.upper() for ext in exts] - for ext in exts: - images.update(glob.glob("{}/*.{}".format(infer_dir, ext))) - images = list(images) - - assert len(images) > 0, "no image found in {}".format(infer_dir) - print("Found {} inference images in total.".format(len(images))) - - return images - - -def visualize(image_list, result, labels, output_dir="output/", threshold=0.5): - # visualize the predict result - start_idx = 0 - for idx, image_file in enumerate(image_list): - im_bboxes_num = result["boxes_num"][idx] - im_results = {} - if "boxes" in result: - im_results["boxes"] = result["boxes"][start_idx : start_idx + im_bboxes_num, :] - if "masks" in result: - im_results["masks"] = result["masks"][start_idx : start_idx + im_bboxes_num, :] - if "segm" in result: - im_results["segm"] = result["segm"][start_idx : start_idx + im_bboxes_num, :] - if "label" in result: - im_results["label"] = result["label"][start_idx : start_idx + im_bboxes_num] - if "score" in result: - im_results["score"] = result["score"][start_idx : start_idx + im_bboxes_num] - - start_idx += im_bboxes_num - im = visualize_box_mask(image_file, im_results, labels, threshold=threshold) - img_name = os.path.split(image_file)[-1] - if not os.path.exists(output_dir): - os.makedirs(output_dir) - out_path = os.path.join(output_dir, img_name) - im.save(out_path, quality=95) - print("save result to: " + out_path) - - -def print_arguments(args): - print("----------- Running Arguments -----------") - for arg, value in sorted(vars(args).items()): - print("%s: %s" % (arg, value)) - print("------------------------------------------") - - -def main(): - deploy_file = os.path.join(FLAGS.model_dir, "infer_cfg.yml") - with open(deploy_file) as f: - yml_conf = yaml.safe_load(f) - arch = yml_conf["arch"] - detector_func = "Detector" - if arch == "SOLOv2": - detector_func = "DetectorSOLOv2" - elif arch == "PicoDet": - detector_func = "DetectorPicoDet" - - detector = eval(detector_func)( - FLAGS.model_dir, - device=FLAGS.device, - run_mode=FLAGS.run_mode, - batch_size=FLAGS.batch_size, - trt_min_shape=FLAGS.trt_min_shape, - trt_max_shape=FLAGS.trt_max_shape, - trt_opt_shape=FLAGS.trt_opt_shape, - trt_calib_mode=FLAGS.trt_calib_mode, - cpu_threads=FLAGS.cpu_threads, - enable_mkldnn=FLAGS.enable_mkldnn, - enable_mkldnn_bfloat16=FLAGS.enable_mkldnn_bfloat16, - threshold=FLAGS.threshold, - output_dir=FLAGS.output_dir, - ) - - # predict from video file or camera video stream - if FLAGS.video_file is not None or FLAGS.camera_id != -1: - detector.predict_video(FLAGS.video_file, FLAGS.camera_id) - else: - # predict from image - if FLAGS.image_dir is None and FLAGS.image_file is not None: - assert FLAGS.batch_size == 1, "batch_size should be 1, when image_file is not None" - img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file) - if FLAGS.slice_infer: - detector.predict_image_slice( - img_list, - FLAGS.slice_size, - FLAGS.overlap_ratio, - FLAGS.combine_method, - FLAGS.match_threshold, - FLAGS.match_metric, - visual=FLAGS.save_images, - save_results=FLAGS.save_results, - ) - else: - detector.predict_image( - img_list, FLAGS.run_benchmark, repeats=100, visual=FLAGS.save_images, save_results=FLAGS.save_results - ) - if not FLAGS.run_benchmark: - detector.det_times.info(average=True) - else: - mode = FLAGS.run_mode - model_dir = FLAGS.model_dir - model_info = {"model_name": model_dir.strip("/").split("/")[-1], "precision": mode.split("_")[-1]} - bench_log(detector, img_list, model_info, name="DET") - - -if __name__ == 
"__main__": - paddle.enable_static() - parser = argsparser() - FLAGS = parser.parse_args() - print_arguments(FLAGS) - FLAGS.device = FLAGS.device.upper() - assert FLAGS.device in ["CPU", "GPU", "XPU", "NPU"], "device should be CPU, GPU, XPU or NPU" - assert not FLAGS.use_gpu, "use_gpu has been deprecated, please use --device" - - assert not ( - FLAGS.enable_mkldnn is False and FLAGS.enable_mkldnn_bfloat16 is True - ), "To enable mkldnn bfloat, please turn on both enable_mkldnn and enable_mkldnn_bfloat16" - - main() diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_infer.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_infer.py deleted file mode 100644 index 5db16102d6e1..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_infer.py +++ /dev/null @@ -1,373 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import os - -import cv2 -import numpy as np -import paddle -import yaml - -from .benchmark_utils import PaddleInferBenchmark -from .infer import Detector, get_test_images, print_arguments -from .keypoint_postprocess import HrHRNetPostProcess, HRNetPostProcess -from .keypoint_preprocess import expand_crop -from .utils import argsparser, get_current_memory_mb -from .visualize import visualize_pose - -# Global dictionary -KEYPOINT_SUPPORT_MODELS = {"HigherHRNet": "keypoint_bottomup", "HRNet": "keypoint_topdown"} - - -class KeyPointDetector(Detector): - """ - Args: - model_dir (str): root path of model.pdiparams, model.pdmodel and infer_cfg.yml - device (str): Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU - run_mode (str): mode of running(paddle/trt_fp32/trt_fp16) - batch_size (int): size of pre batch in inference - trt_min_shape (int): min shape for dynamic shape in trt - trt_max_shape (int): max shape for dynamic shape in trt - trt_opt_shape (int): opt shape for dynamic shape in trt - trt_calib_mode (bool): If the model is produced by TRT offline quantitative - calibration, trt_calib_mode need to set True - cpu_threads (int): cpu threads - enable_mkldnn (bool): whether to open MKLDNN - use_dark(bool): whether to use postprocess in DarkPose - """ - - def __init__( - self, - model_dir, - device="CPU", - run_mode="paddle", - batch_size=1, - trt_min_shape=1, - trt_max_shape=1280, - trt_opt_shape=640, - trt_calib_mode=False, - cpu_threads=1, - enable_mkldnn=False, - output_dir="output", - threshold=0.5, - use_dark=True, - ): - super(KeyPointDetector, self).__init__( - model_dir=model_dir, - device=device, - run_mode=run_mode, - batch_size=batch_size, - trt_min_shape=trt_min_shape, - trt_max_shape=trt_max_shape, - trt_opt_shape=trt_opt_shape, - trt_calib_mode=trt_calib_mode, - cpu_threads=cpu_threads, - enable_mkldnn=enable_mkldnn, - output_dir=output_dir, - threshold=threshold, - ) - self.use_dark = use_dark - - def set_config(self, model_dir): - return PredictConfig_KeyPoint(model_dir) - - def 
get_person_from_rect(self, image, results): - # crop the person result from image - self.det_times.preprocess_time_s.start() - valid_rects = results["boxes"] - rect_images = [] - new_rects = [] - org_rects = [] - for rect in valid_rects: - rect_image, new_rect, org_rect = expand_crop(image, rect) - if rect_image is None or rect_image.size == 0: - continue - rect_images.append(rect_image) - new_rects.append(new_rect) - org_rects.append(org_rect) - self.det_times.preprocess_time_s.end() - return rect_images, new_rects, org_rects - - def postprocess(self, inputs, result): - np_heatmap = result["heatmap"] - np_masks = result["masks"] - # postprocess output of predictor - if KEYPOINT_SUPPORT_MODELS[self.pred_config.arch] == "keypoint_bottomup": - results = {} - h, w = inputs["im_shape"][0] - preds = [np_heatmap] - if np_masks is not None: - preds += np_masks - preds += [h, w] - keypoint_postprocess = HrHRNetPostProcess() - kpts, scores = keypoint_postprocess(*preds) - results["keypoint"] = kpts - results["score"] = scores - return results - elif KEYPOINT_SUPPORT_MODELS[self.pred_config.arch] == "keypoint_topdown": - results = {} - imshape = inputs["im_shape"][:, ::-1] - center = np.round(imshape / 2.0) - scale = imshape / 200.0 - keypoint_postprocess = HRNetPostProcess(use_dark=self.use_dark) - kpts, scores = keypoint_postprocess(np_heatmap, center, scale) - results["keypoint"] = kpts - results["score"] = scores - return results - else: - raise ValueError("Unsupported arch: {}, expect {}".format(self.pred_config.arch, KEYPOINT_SUPPORT_MODELS)) - - def predict(self, repeats=1): - """ - Args: - repeats (int): repeat number for prediction - Returns: - results (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box, - matix element:[class, score, x_min, y_min, x_max, y_max] - MaskRCNN's results include 'masks': np.ndarray: - shape: [N, im_h, im_w] - """ - # model prediction - np_heatmap, np_masks = None, None - for i in range(repeats): - self.predictor.run() - output_names = self.predictor.get_output_names() - heatmap_tensor = self.predictor.get_output_handle(output_names[0]) - np_heatmap = heatmap_tensor.copy_to_cpu() - if self.pred_config.tagmap: - masks_tensor = self.predictor.get_output_handle(output_names[1]) - heat_k = self.predictor.get_output_handle(output_names[2]) - inds_k = self.predictor.get_output_handle(output_names[3]) - np_masks = [masks_tensor.copy_to_cpu(), heat_k.copy_to_cpu(), inds_k.copy_to_cpu()] - result = dict(heatmap=np_heatmap, masks=np_masks) - return result - - def predict_image(self, image_list, run_benchmark=False, repeats=1, visual=True): - results = [] - batch_loop_cnt = math.ceil(float(len(image_list)) / self.batch_size) - for i in range(batch_loop_cnt): - start_index = i * self.batch_size - end_index = min((i + 1) * self.batch_size, len(image_list)) - batch_image_list = image_list[start_index:end_index] - if run_benchmark: - # preprocess - inputs = self.preprocess(batch_image_list) # warmup - self.det_times.preprocess_time_s.start() - inputs = self.preprocess(batch_image_list) - self.det_times.preprocess_time_s.end() - - # model prediction - self.predict(repeats=repeats) # warmup - self.det_times.inference_time_s.start() - result = self.predict(repeats=repeats) - self.det_times.inference_time_s.end(repeats=repeats) - - # postprocess - self.postprocess(inputs, result) # warmup - self.det_times.postprocess_time_s.start() - result = self.postprocess(inputs, result) - self.det_times.postprocess_time_s.end() - self.det_times.img_num += len(batch_image_list) - 
- cm, gm, gu = get_current_memory_mb() - self.cpu_mem += cm - self.gpu_mem += gm - self.gpu_util += gu - - else: - # preprocess - self.det_times.preprocess_time_s.start() - inputs = self.preprocess(batch_image_list) - self.det_times.preprocess_time_s.end() - - # model prediction - self.det_times.inference_time_s.start() - result = self.predict() - self.det_times.inference_time_s.end() - - # postprocess - self.det_times.postprocess_time_s.start() - result = self.postprocess(inputs, result) - self.det_times.postprocess_time_s.end() - self.det_times.img_num += len(batch_image_list) - - if visual: - if not os.path.exists(self.output_dir): - os.makedirs(self.output_dir) - visualize(batch_image_list, result, visual_thresh=self.threshold, save_dir=self.output_dir) - - results.append(result) - results = self.merge_batch_result(results) - return results - - def predict_video(self, video_file, camera_id): - video_name = "output.mp4" - if camera_id != -1: - capture = cv2.VideoCapture(camera_id) - else: - capture = cv2.VideoCapture(video_file) - video_name = os.path.split(video_file)[-1] - # Get Video info : resolution, fps, frame count - width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)) - height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)) - fps = int(capture.get(cv2.CAP_PROP_FPS)) - frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) - print("fps: %d, frame_count: %d" % (fps, frame_count)) - - if not os.path.exists(self.output_dir): - os.makedirs(self.output_dir) - out_path = os.path.join(self.output_dir, video_name) - fourcc = cv2.VideoWriter_fourcc(*"mp4v") - writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) - index = 1 - while 1: - ret, frame = capture.read() - if not ret: - break - print("detect frame: %d" % (index)) - index += 1 - results = self.predict_image([frame[:, :, ::-1]], visual=False) - im_results = {} - im_results["keypoint"] = [results["keypoint"], results["score"]] - im = visualize_pose(frame, im_results, visual_thresh=self.threshold, returnimg=True) - writer.write(im) - if camera_id != -1: - cv2.imshow("Mask Detection", im) - if cv2.waitKey(1) & 0xFF == ord("q"): - break - writer.release() - - -def create_inputs(imgs, im_info): - """generate input for different model type - Args: - imgs (list(numpy)): list of image (np.ndarray) - im_info (list(dict)): list of image info - Returns: - inputs (dict): input of model - """ - inputs = {} - inputs["image"] = np.stack(imgs, axis=0).astype("float32") - im_shape = [] - for e in im_info: - im_shape.append(np.array((e["im_shape"])).astype("float32")) - inputs["im_shape"] = np.stack(im_shape, axis=0) - return inputs - - -class PredictConfig_KeyPoint: - """set config of preprocess, postprocess and visualize - Args: - model_dir (str): root path of model.yml - """ - - def __init__(self, model_dir): - # parsing Yaml config for Preprocess - deploy_file = os.path.join(model_dir, "infer_cfg.yml") - with open(deploy_file) as f: - yml_conf = yaml.safe_load(f) - self.check_model(yml_conf) - self.arch = yml_conf["arch"] - self.archcls = KEYPOINT_SUPPORT_MODELS[yml_conf["arch"]] - self.preprocess_infos = yml_conf["Preprocess"] - self.min_subgraph_size = yml_conf["min_subgraph_size"] - self.labels = yml_conf["label_list"] - self.tagmap = False - self.use_dynamic_shape = yml_conf["use_dynamic_shape"] - if "keypoint_bottomup" == self.archcls: - self.tagmap = True - self.print_config() - - def check_model(self, yml_conf): - """ - Raises: - ValueError: loaded model not in supported model type - """ - for support_model in 
KEYPOINT_SUPPORT_MODELS: - if support_model in yml_conf["arch"]: - return True - raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf["arch"], KEYPOINT_SUPPORT_MODELS)) - - def print_config(self): - print("----------- Model Configuration -----------") - print("%s: %s" % ("Model Arch", self.arch)) - print("%s: " % ("Transform Order")) - for op_info in self.preprocess_infos: - print("--%s: %s" % ("transform op", op_info["type"])) - print("--------------------------------------------") - - -def visualize(image_list, results, visual_thresh=0.6, save_dir="output"): - im_results = {} - for i, image_file in enumerate(image_list): - skeletons = results["keypoint"] - scores = results["score"] - skeleton = skeletons[i : i + 1] - score = scores[i : i + 1] - im_results["keypoint"] = [skeleton, score] - visualize_pose(image_file, im_results, visual_thresh=visual_thresh, save_dir=save_dir) - - -def main(): - detector = KeyPointDetector( - FLAGS.model_dir, - device=FLAGS.device, - run_mode=FLAGS.run_mode, - batch_size=FLAGS.batch_size, - trt_min_shape=FLAGS.trt_min_shape, - trt_max_shape=FLAGS.trt_max_shape, - trt_opt_shape=FLAGS.trt_opt_shape, - trt_calib_mode=FLAGS.trt_calib_mode, - cpu_threads=FLAGS.cpu_threads, - enable_mkldnn=FLAGS.enable_mkldnn, - threshold=FLAGS.threshold, - output_dir=FLAGS.output_dir, - use_dark=FLAGS.use_dark, - ) - - # predict from video file or camera video stream - if FLAGS.video_file is not None or FLAGS.camera_id != -1: - detector.predict_video(FLAGS.video_file, FLAGS.camera_id) - else: - # predict from image - img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file) - detector.predict_image(img_list, FLAGS.run_benchmark, repeats=10) - if not FLAGS.run_benchmark: - detector.det_times.info(average=True) - else: - mems = { - "cpu_rss_mb": detector.cpu_mem / len(img_list), - "gpu_rss_mb": detector.gpu_mem / len(img_list), - "gpu_util": detector.gpu_util * 100 / len(img_list), - } - perf_info = detector.det_times.report(average=True) - model_dir = FLAGS.model_dir - mode = FLAGS.run_mode - model_info = {"model_name": model_dir.strip("/").split("/")[-1], "precision": mode.split("_")[-1]} - data_info = {"batch_size": 1, "shape": "dynamic_shape", "data_num": perf_info["img_num"]} - det_log = PaddleInferBenchmark(detector.config, model_info, data_info, perf_info, mems) - det_log("KeyPoint") - - -if __name__ == "__main__": - paddle.enable_static() - parser = argsparser() - FLAGS = parser.parse_args() - print_arguments(FLAGS) - FLAGS.device = FLAGS.device.upper() - assert FLAGS.device in ["CPU", "GPU", "XPU"], "device should be CPU, GPU or XPU" - assert not FLAGS.use_gpu, "use_gpu has been deprecated, please use --device" - - main() diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_postprocess.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_postprocess.py deleted file mode 100644 index dace8bddf48e..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_postprocess.py +++ /dev/null @@ -1,347 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from collections import defaultdict - -import cv2 -import numpy as np -from scipy.optimize import linear_sum_assignment - -from .keypoint_preprocess import get_affine_mat_kernel, get_affine_transform - - -class HrHRNetPostProcess(object): - """ - HrHRNet postprocess contain: - 1) get topk keypoints in the output heatmap - 2) sample the tagmap's value corresponding to each of the topk coordinate - 3) match different joints to combine to some people with Hungary algorithm - 4) adjust the coordinate by +-0.25 to decrease error std - 5) salvage missing joints by check positivity of heatmap - tagdiff_norm - Args: - max_num_people (int): max number of people support in postprocess - heat_thresh (float): value of topk below this threshhold will be ignored - tag_thresh (float): coord's value sampled in tagmap below this threshold belong to same people for init - - inputs(list[heatmap]): the output list of model, [heatmap, heatmap_maxpool, tagmap], heatmap_maxpool used to get topk - original_height, original_width (float): the original image size - """ - - def __init__(self, max_num_people=30, heat_thresh=0.2, tag_thresh=1.0): - self.max_num_people = max_num_people - self.heat_thresh = heat_thresh - self.tag_thresh = tag_thresh - - def lerp(self, j, y, x, heatmap): - H, W = heatmap.shape[-2:] - left = np.clip(x - 1, 0, W - 1) - right = np.clip(x + 1, 0, W - 1) - up = np.clip(y - 1, 0, H - 1) - down = np.clip(y + 1, 0, H - 1) - offset_y = np.where(heatmap[j, down, x] > heatmap[j, up, x], 0.25, -0.25) - offset_x = np.where(heatmap[j, y, right] > heatmap[j, y, left], 0.25, -0.25) - return offset_y + 0.5, offset_x + 0.5 - - def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, original_width): - - N, J, H, W = heatmap.shape - assert N == 1, "only support batch size 1" - heatmap = heatmap[0] - tagmap = tagmap[0] - heats = heat_k[0] - inds_np = inds_k[0] - y = inds_np // W - x = inds_np % W - tags = tagmap[np.arange(J)[None, :].repeat(self.max_num_people), y.flatten(), x.flatten()].reshape( - J, -1, tagmap.shape[-1] - ) - coords = np.stack((y, x), axis=2) - # threshold - mask = heats > self.heat_thresh - # cluster - cluster = defaultdict( - lambda: {"coords": np.zeros((J, 2), dtype=np.float32), "scores": np.zeros(J, dtype=np.float32), "tags": []} - ) - for jid, m in enumerate(mask): - num_valid = m.sum() - if num_valid == 0: - continue - valid_inds = np.where(m)[0] - valid_tags = tags[jid, m, :] - if len(cluster) == 0: # initialize - for i in valid_inds: - tag = tags[jid, i] - key = tag[0] - cluster[key]["tags"].append(tag) - cluster[key]["scores"][jid] = heats[jid, i] - cluster[key]["coords"][jid] = coords[jid, i] - continue - candidates = list(cluster.keys())[: self.max_num_people] - centroids = [np.mean(cluster[k]["tags"], axis=0) for k in candidates] - num_clusters = len(centroids) - # shape is (num_valid, num_clusters, tag_dim) - dist = valid_tags[:, None, :] - np.array(centroids)[None, ...] 
- l2_dist = np.linalg.norm(dist, ord=2, axis=2) - # modulate dist with heat value, see `use_detection_val` - cost = np.round(l2_dist) * 100 - heats[jid, m, None] - # pad the cost matrix, otherwise new pose are ignored - if num_valid > num_clusters: - cost = np.pad( - cost, ((0, 0), (0, num_valid - num_clusters)), "constant", constant_values=((0, 0), (0, 1e-10)) - ) - rows, cols = linear_sum_assignment(cost) - for y, x in zip(rows, cols): - tag = tags[jid, y] - if y < num_valid and x < num_clusters and l2_dist[y, x] < self.tag_thresh: - key = candidates[x] # merge to cluster - else: - key = tag[0] # initialize new cluster - cluster[key]["tags"].append(tag) - cluster[key]["scores"][jid] = heats[jid, y] - cluster[key]["coords"][jid] = coords[jid, y] - - # shape is [k, J, 2] and [k, J] - pose_tags = np.array([cluster[k]["tags"] for k in cluster]) - pose_coords = np.array([cluster[k]["coords"] for k in cluster]) - pose_scores = np.array([cluster[k]["scores"] for k in cluster]) - valid = pose_scores > 0 - - pose_kpts = np.zeros((pose_scores.shape[0], J, 3), dtype=np.float32) - if valid.sum() == 0: - return pose_kpts, pose_kpts - - # refine coords - valid_coords = pose_coords[valid].astype(np.int32) - y = valid_coords[..., 0].flatten() - x = valid_coords[..., 1].flatten() - _, j = np.nonzero(valid) - offsets = self.lerp(j, y, x, heatmap) - pose_coords[valid, 0] += offsets[0] - pose_coords[valid, 1] += offsets[1] - - # mean score before salvage - mean_score = pose_scores.mean(axis=1) - pose_kpts[valid, 2] = pose_scores[valid] - - # salvage missing joints - if True: - for pid, coords in enumerate(pose_coords): - tag_mean = np.array(pose_tags[pid]).mean(axis=0) - norm = np.sum((tagmap - tag_mean) ** 2, axis=3) ** 0.5 - score = heatmap - np.round(norm) # (J, H, W) - flat_score = score.reshape(J, -1) - max_inds = np.argmax(flat_score, axis=1) - max_scores = np.max(flat_score, axis=1) - salvage_joints = (pose_scores[pid] == 0) & (max_scores > 0) - if salvage_joints.sum() == 0: - continue - y = max_inds[salvage_joints] // W - x = max_inds[salvage_joints] % W - offsets = self.lerp(salvage_joints.nonzero()[0], y, x, heatmap) - y = y.astype(np.float32) + offsets[0] - x = x.astype(np.float32) + offsets[1] - pose_coords[pid][salvage_joints, 0] = y - pose_coords[pid][salvage_joints, 1] = x - pose_kpts[pid][salvage_joints, 2] = max_scores[salvage_joints] - pose_kpts[..., :2] = transpred(pose_coords[..., :2][..., ::-1], original_height, original_width, min(H, W)) - return pose_kpts, mean_score - - -def transpred(kpts, h, w, s): - trans, _ = get_affine_mat_kernel(h, w, s, inv=True) - - return warp_affine_joints(kpts[..., :2].copy(), trans) - - -def warp_affine_joints(joints, mat): - """Apply affine transformation defined by the transform matrix on the - joints. - - Args: - joints (np.ndarray[..., 2]): Origin coordinate of joints. - mat (np.ndarray[3, 2]): The affine matrix. - - Returns: - matrix (np.ndarray[..., 2]): Result coordinate of joints. 
- """ - joints = np.array(joints) - shape = joints.shape - joints = joints.reshape(-1, 2) - return np.dot(np.concatenate((joints, joints[:, 0:1] * 0 + 1), axis=1), mat.T).reshape(shape) - - -class HRNetPostProcess(object): - def __init__(self, use_dark=True): - self.use_dark = use_dark - - def flip_back(self, output_flipped, matched_parts): - assert output_flipped.ndim == 4, "output_flipped should be [batch_size, num_joints, height, width]" - - output_flipped = output_flipped[:, :, :, ::-1] - - for pair in matched_parts: - tmp = output_flipped[:, pair[0], :, :].copy() - output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :] - output_flipped[:, pair[1], :, :] = tmp - - return output_flipped - - def get_max_preds(self, heatmaps): - """get predictions from score maps - - Args: - heatmaps: numpy.ndarray([batch_size, num_joints, height, width]) - - Returns: - preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords - maxvals: numpy.ndarray([batch_size, num_joints, 2]), the maximum confidence of the keypoints - """ - assert isinstance(heatmaps, np.ndarray), "heatmaps should be numpy.ndarray" - assert heatmaps.ndim == 4, "batch_images should be 4-ndim" - - batch_size = heatmaps.shape[0] - num_joints = heatmaps.shape[1] - width = heatmaps.shape[3] - heatmaps_reshaped = heatmaps.reshape((batch_size, num_joints, -1)) - idx = np.argmax(heatmaps_reshaped, 2) - maxvals = np.amax(heatmaps_reshaped, 2) - - maxvals = maxvals.reshape((batch_size, num_joints, 1)) - idx = idx.reshape((batch_size, num_joints, 1)) - - preds = np.tile(idx, (1, 1, 2)).astype(np.float32) - - preds[:, :, 0] = (preds[:, :, 0]) % width - preds[:, :, 1] = np.floor((preds[:, :, 1]) / width) - - pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2)) - pred_mask = pred_mask.astype(np.float32) - - preds *= pred_mask - - return preds, maxvals - - def gaussian_blur(self, heatmap, kernel): - border = (kernel - 1) // 2 - batch_size = heatmap.shape[0] - num_joints = heatmap.shape[1] - height = heatmap.shape[2] - width = heatmap.shape[3] - for i in range(batch_size): - for j in range(num_joints): - origin_max = np.max(heatmap[i, j]) - dr = np.zeros((height + 2 * border, width + 2 * border)) - dr[border:-border, border:-border] = heatmap[i, j].copy() - dr = cv2.GaussianBlur(dr, (kernel, kernel), 0) - heatmap[i, j] = dr[border:-border, border:-border].copy() - heatmap[i, j] *= origin_max / np.max(heatmap[i, j]) - return heatmap - - def dark_parse(self, hm, coord): - heatmap_height = hm.shape[0] - heatmap_width = hm.shape[1] - px = int(coord[0]) - py = int(coord[1]) - if 1 < px < heatmap_width - 2 and 1 < py < heatmap_height - 2: - dx = 0.5 * (hm[py][px + 1] - hm[py][px - 1]) - dy = 0.5 * (hm[py + 1][px] - hm[py - 1][px]) - dxx = 0.25 * (hm[py][px + 2] - 2 * hm[py][px] + hm[py][px - 2]) - dxy = 0.25 * (hm[py + 1][px + 1] - hm[py - 1][px + 1] - hm[py + 1][px - 1] + hm[py - 1][px - 1]) - dyy = 0.25 * (hm[py + 2 * 1][px] - 2 * hm[py][px] + hm[py - 2 * 1][px]) - derivative = np.matrix([[dx], [dy]]) - hessian = np.matrix([[dxx, dxy], [dxy, dyy]]) - if dxx * dyy - dxy**2 != 0: - hessianinv = hessian.I - offset = -hessianinv * derivative - offset = np.squeeze(np.array(offset.T), axis=0) - coord += offset - return coord - - def dark_postprocess(self, hm, coords, kernelsize): - """ - refer to https://github.com/ilovepose/DarkPose/lib/core/inference.py - - """ - hm = self.gaussian_blur(hm, kernelsize) - hm = np.maximum(hm, 1e-10) - hm = np.log(hm) - for n in range(coords.shape[0]): - for p in range(coords.shape[1]): - coords[n, p] 
= self.dark_parse(hm[n][p], coords[n][p]) - return coords - - def get_final_preds(self, heatmaps, center, scale, kernelsize=3): - """the highest heatvalue location with a quarter offset in the - direction from the highest response to the second highest response. - - Args: - heatmaps (numpy.ndarray): The predicted heatmaps - center (numpy.ndarray): The boxes center - scale (numpy.ndarray): The scale factor - - Returns: - preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords - maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints - """ - - coords, maxvals = self.get_max_preds(heatmaps) - - heatmap_height = heatmaps.shape[2] - heatmap_width = heatmaps.shape[3] - - if self.use_dark: - coords = self.dark_postprocess(heatmaps, coords, kernelsize) - else: - for n in range(coords.shape[0]): - for p in range(coords.shape[1]): - hm = heatmaps[n][p] - px = int(math.floor(coords[n][p][0] + 0.5)) - py = int(math.floor(coords[n][p][1] + 0.5)) - if 1 < px < heatmap_width - 1 and 1 < py < heatmap_height - 1: - diff = np.array([hm[py][px + 1] - hm[py][px - 1], hm[py + 1][px] - hm[py - 1][px]]) - coords[n][p] += np.sign(diff) * 0.25 - preds = coords.copy() - - # Transform back - for i in range(coords.shape[0]): - preds[i] = transform_preds(coords[i], center[i], scale[i], [heatmap_width, heatmap_height]) - - return preds, maxvals - - def __call__(self, output, center, scale): - preds, maxvals = self.get_final_preds(output, center, scale) - return np.concatenate((preds, maxvals), axis=-1), np.mean(maxvals, axis=1) - - -def transform_preds(coords, center, scale, output_size): - target_coords = np.zeros(coords.shape) - trans = get_affine_transform(center, scale * 200, 0, output_size, inv=1) - for p in range(coords.shape[0]): - target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans) - return target_coords - - -def affine_transform(pt, t): - new_pt = np.array([pt[0], pt[1], 1.0]).T - new_pt = np.dot(t, new_pt) - return new_pt[:2] - - -def translate_to_ori_images(keypoint_result, batch_records): - kpts = keypoint_result["keypoint"] - scores = keypoint_result["score"] - kpts[..., 0] += batch_records[:, 0:1] - kpts[..., 1] += batch_records[:, 1:2] - return kpts, scores diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_preprocess.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_preprocess.py deleted file mode 100644 index 0ae0c8adef43..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_preprocess.py +++ /dev/null @@ -1,233 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
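For reference, a minimal, self-contained sketch of the DARK-style sub-pixel refinement implemented by dark_parse/dark_postprocess in keypoint_postprocess.py above: the integer argmax of a (log-)heatmap is shifted by the second-order Taylor offset -H^{-1}·grad computed from finite differences. The helper name taylor_refine and the toy Gaussian heatmap are illustrative only and are not part of the original file.

```python
import numpy as np

def taylor_refine(hm, x, y):
    # central finite differences of the (log-)heatmap around the integer peak (x, y)
    dx = 0.5 * (hm[y, x + 1] - hm[y, x - 1])
    dy = 0.5 * (hm[y + 1, x] - hm[y - 1, x])
    dxx = 0.25 * (hm[y, x + 2] - 2 * hm[y, x] + hm[y, x - 2])
    dyy = 0.25 * (hm[y + 2, x] - 2 * hm[y, x] + hm[y - 2, x])
    dxy = 0.25 * (hm[y + 1, x + 1] - hm[y - 1, x + 1] - hm[y + 1, x - 1] + hm[y - 1, x - 1])
    grad = np.array([dx, dy])
    hess = np.array([[dxx, dxy], [dxy, dyy]])
    if np.linalg.det(hess) == 0:
        return np.array([x, y], dtype=np.float32)
    # second-order Taylor expansion: offset = -H^{-1} @ grad
    offset = -np.linalg.solve(hess, grad)
    return np.array([x, y], dtype=np.float32) + offset

# toy heatmap with a true peak at (x=12.3, y=7.6)
yy, xx = np.mgrid[0:32, 0:32]
heat = np.exp(-((xx - 12.3) ** 2 + (yy - 7.6) ** 2) / (2 * 2.0 ** 2))
py, px = np.unravel_index(np.argmax(heat), heat.shape)
print(taylor_refine(np.log(np.maximum(heat, 1e-10)), px, py))  # ~ [12.3, 7.6]
```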
-""" -this code is based on https://github.com/open-mmlab/mmpose/mmpose/core/post_processing/post_transforms.py -""" -import cv2 -import numpy as np - - -class EvalAffine(object): - def __init__(self, size, stride=64): - super(EvalAffine, self).__init__() - self.size = size - self.stride = stride - - def __call__(self, image, im_info): - s = self.size - h, w, _ = image.shape - trans, size_resized = get_affine_mat_kernel(h, w, s, inv=False) - image_resized = cv2.warpAffine(image, trans, size_resized) - return image_resized, im_info - - -def get_affine_mat_kernel(h, w, s, inv=False): - if w < h: - w_ = s - h_ = int(np.ceil((s / w * h) / 64.0) * 64) - scale_w = w - scale_h = h_ / w_ * w - - else: - h_ = s - w_ = int(np.ceil((s / h * w) / 64.0) * 64) - scale_h = h - scale_w = w_ / h_ * h - - center = np.array([np.round(w / 2.0), np.round(h / 2.0)]) - - size_resized = (w_, h_) - trans = get_affine_transform(center, np.array([scale_w, scale_h]), 0, size_resized, inv=inv) - - return trans, size_resized - - -def get_affine_transform(center, input_size, rot, output_size, shift=(0.0, 0.0), inv=False): - """Get the affine transform matrix, given the center/scale/rot/output_size. - - Args: - center (np.ndarray[2, ]): Center of the bounding box (x, y). - scale (np.ndarray[2, ]): Scale of the bounding box - wrt [width, height]. - rot (float): Rotation angle (degree). - output_size (np.ndarray[2, ]): Size of the destination heatmaps. - shift (0-100%): Shift translation ratio wrt the width/height. - Default (0., 0.). - inv (bool): Option to inverse the affine transform direction. - (inv=False: src->dst or inv=True: dst->src) - - Returns: - np.ndarray: The transform matrix. - """ - assert len(center) == 2 - assert len(output_size) == 2 - assert len(shift) == 2 - if not isinstance(input_size, (np.ndarray, list)): - input_size = np.array([input_size, input_size], dtype=np.float32) - scale_tmp = input_size - - shift = np.array(shift) - src_w = scale_tmp[0] - dst_w = output_size[0] - dst_h = output_size[1] - - rot_rad = np.pi * rot / 180 - src_dir = rotate_point([0.0, src_w * -0.5], rot_rad) - dst_dir = np.array([0.0, dst_w * -0.5]) - - src = np.zeros((3, 2), dtype=np.float32) - src[0, :] = center + scale_tmp * shift - src[1, :] = center + src_dir + scale_tmp * shift - src[2, :] = _get_3rd_point(src[0, :], src[1, :]) - - dst = np.zeros((3, 2), dtype=np.float32) - dst[0, :] = [dst_w * 0.5, dst_h * 0.5] - dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir - dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :]) - - if inv: - trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) - else: - trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) - - return trans - - -def get_warp_matrix(theta, size_input, size_dst, size_target): - """This code is based on - https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py - - Calculate the transformation matrix under the constraint of unbiased. - Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased - Data Processing for Human Pose Estimation (CVPR 2020). - - Args: - theta (float): Rotation angle in degrees. - size_input (np.ndarray): Size of input image [w, h]. - size_dst (np.ndarray): Size of output image [w, h]. - size_target (np.ndarray): Size of ROI in input plane [w, h]. - - Returns: - matrix (np.ndarray): A matrix for transformation. 
- """ - theta = np.deg2rad(theta) - matrix = np.zeros((2, 3), dtype=np.float32) - scale_x = size_dst[0] / size_target[0] - scale_y = size_dst[1] / size_target[1] - matrix[0, 0] = np.cos(theta) * scale_x - matrix[0, 1] = -np.sin(theta) * scale_x - matrix[0, 2] = scale_x * ( - -0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] * np.sin(theta) + 0.5 * size_target[0] - ) - matrix[1, 0] = np.sin(theta) * scale_y - matrix[1, 1] = np.cos(theta) * scale_y - matrix[1, 2] = scale_y * ( - -0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] * np.cos(theta) + 0.5 * size_target[1] - ) - return matrix - - -def rotate_point(pt, angle_rad): - """Rotate a point by an angle. - - Args: - pt (list[float]): 2 dimensional point to be rotated - angle_rad (float): rotation angle by radian - - Returns: - list[float]: Rotated point. - """ - assert len(pt) == 2 - sn, cs = np.sin(angle_rad), np.cos(angle_rad) - new_x = pt[0] * cs - pt[1] * sn - new_y = pt[0] * sn + pt[1] * cs - rotated_pt = [new_x, new_y] - - return rotated_pt - - -def _get_3rd_point(a, b): - """To calculate the affine matrix, three pairs of points are required. This - function is used to get the 3rd point, given 2D points a & b. - - The 3rd point is defined by rotating vector `a - b` by 90 degrees - anticlockwise, using b as the rotation center. - - Args: - a (np.ndarray): point(x,y) - b (np.ndarray): point(x,y) - - Returns: - np.ndarray: The 3rd point. - """ - assert len(a) == 2 - assert len(b) == 2 - direction = a - b - third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32) - - return third_pt - - -class TopDownEvalAffine(object): - """apply affine transform to image and coords - - Args: - trainsize (list): [w, h], the standard size used to train - use_udp (bool): whether to use Unbiased Data Processing. 
- records(dict): the dict contained the image and coords - - Returns: - records (dict): contain the image and coords after tranformed - - """ - - def __init__(self, trainsize, use_udp=False): - self.trainsize = trainsize - self.use_udp = use_udp - - def __call__(self, image, im_info): - rot = 0 - imshape = im_info["im_shape"][::-1] - center = im_info["center"] if "center" in im_info else imshape / 2.0 - scale = im_info["scale"] if "scale" in im_info else imshape - if self.use_udp: - trans = get_warp_matrix(rot, center * 2.0, [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], scale) - image = cv2.warpAffine( - image, trans, (int(self.trainsize[0]), int(self.trainsize[1])), flags=cv2.INTER_LINEAR - ) - else: - trans = get_affine_transform(center, scale, rot, self.trainsize) - image = cv2.warpAffine( - image, trans, (int(self.trainsize[0]), int(self.trainsize[1])), flags=cv2.INTER_LINEAR - ) - - return image, im_info - - -def expand_crop(images, rect, expand_ratio=0.3): - imgh, imgw, c = images.shape - label, conf, xmin, ymin, xmax, ymax = [int(x) for x in rect.tolist()] - if label != 0: - return None, None, None - org_rect = [xmin, ymin, xmax, ymax] - h_half = (ymax - ymin) * (1 + expand_ratio) / 2.0 - w_half = (xmax - xmin) * (1 + expand_ratio) / 2.0 - if h_half > w_half * 4 / 3: - w_half = h_half * 0.75 - center = [(ymin + ymax) / 2.0, (xmin + xmax) / 2.0] - ymin = max(0, int(center[0] - h_half)) - ymax = min(imgh - 1, int(center[0] + h_half)) - xmin = max(0, int(center[1] - w_half)) - xmax = min(imgw - 1, int(center[1] + w_half)) - return images[ymin:ymax, xmin:xmax, :], [xmin, ymin, xmax, ymax], org_rect diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/picodet_postprocess.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/picodet_postprocess.py deleted file mode 100644 index 4af0d85eb536..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/picodet_postprocess.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -from scipy.special import softmax - - -def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200): - """ - Args: - box_scores (N, 5): boxes in corner-form and probabilities. - iou_threshold: intersection over union threshold. - top_k: keep top_k results. If k <= 0, keep all the results. - candidate_size: only consider the candidates with the highest scores. 
- Returns: - picked: a list of indexes of the kept boxes - """ - scores = box_scores[:, -1] - boxes = box_scores[:, :-1] - picked = [] - indexes = np.argsort(scores) - indexes = indexes[-candidate_size:] - while len(indexes) > 0: - current = indexes[-1] - picked.append(current) - if 0 < top_k == len(picked) or len(indexes) == 1: - break - current_box = boxes[current, :] - indexes = indexes[:-1] - rest_boxes = boxes[indexes, :] - iou = iou_of( - rest_boxes, - np.expand_dims(current_box, axis=0), - ) - indexes = indexes[iou <= iou_threshold] - - return box_scores[picked, :] - - -def iou_of(boxes0, boxes1, eps=1e-5): - """Return intersection-over-union (Jaccard index) of boxes. - Args: - boxes0 (N, 4): ground truth boxes. - boxes1 (N or 1, 4): predicted boxes. - eps: a small number to avoid 0 as denominator. - Returns: - iou (N): IoU values. - """ - overlap_left_top = np.maximum(boxes0[..., :2], boxes1[..., :2]) - overlap_right_bottom = np.minimum(boxes0[..., 2:], boxes1[..., 2:]) - - overlap_area = area_of(overlap_left_top, overlap_right_bottom) - area0 = area_of(boxes0[..., :2], boxes0[..., 2:]) - area1 = area_of(boxes1[..., :2], boxes1[..., 2:]) - return overlap_area / (area0 + area1 - overlap_area + eps) - - -def area_of(left_top, right_bottom): - """Compute the areas of rectangles given two corners. - Args: - left_top (N, 2): left top corner. - right_bottom (N, 2): right bottom corner. - Returns: - area (N): return the area. - """ - hw = np.clip(right_bottom - left_top, 0.0, None) - return hw[..., 0] * hw[..., 1] - - -class PicoDetPostProcess(object): - """ - Args: - input_shape (int): network input image size - ori_shape (int): ori image shape of before padding - scale_factor (float): scale factor of ori image - enable_mkldnn (bool): whether to open MKLDNN - """ - - def __init__( - self, - input_shape, - ori_shape, - scale_factor, - strides=[8, 16, 32, 64], - score_threshold=0.4, - nms_threshold=0.5, - nms_top_k=1000, - keep_top_k=100, - ): - self.ori_shape = ori_shape - self.input_shape = input_shape - self.scale_factor = scale_factor - self.strides = strides - self.score_threshold = score_threshold - self.nms_threshold = nms_threshold - self.nms_top_k = nms_top_k - self.keep_top_k = keep_top_k - - def warp_boxes(self, boxes, ori_shape): - """Apply transform to boxes""" - width, height = ori_shape[1], ori_shape[0] - n = len(boxes) - if n: - # warp points - xy = np.ones((n * 4, 3)) - xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 - # xy = xy @ M.T # transform - xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8) # rescale - # create new boxes - x = xy[:, [0, 2, 4, 6]] - y = xy[:, [1, 3, 5, 7]] - xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T - # clip boxes - xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width) - xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height) - return xy.astype(np.float32) - else: - return boxes - - def __call__(self, scores, raw_boxes): - batch_size = raw_boxes[0].shape[0] - reg_max = int(raw_boxes[0].shape[-1] / 4 - 1) - out_boxes_num = [] - out_boxes_list = [] - for batch_id in range(batch_size): - # generate centers - decode_boxes = [] - select_scores = [] - for stride, box_distribute, score in zip(self.strides, raw_boxes, scores): - box_distribute = box_distribute[batch_id] - score = score[batch_id] - # centers - fm_h = self.input_shape[0] / stride - fm_w = self.input_shape[1] / stride - h_range = np.arange(fm_h) - w_range = np.arange(fm_w) - ww, hh = np.meshgrid(w_range, h_range) - ct_row = (hh.flatten() + 
0.5) * stride - ct_col = (ww.flatten() + 0.5) * stride - center = np.stack((ct_col, ct_row, ct_col, ct_row), axis=1) - - # box distribution to distance - reg_range = np.arange(reg_max + 1) - box_distance = box_distribute.reshape((-1, reg_max + 1)) - box_distance = softmax(box_distance, axis=1) - box_distance = box_distance * np.expand_dims(reg_range, axis=0) - box_distance = np.sum(box_distance, axis=1).reshape((-1, 4)) - box_distance = box_distance * stride - - # top K candidate - topk_idx = np.argsort(score.max(axis=1))[::-1] - topk_idx = topk_idx[: self.nms_top_k] - center = center[topk_idx] - score = score[topk_idx] - box_distance = box_distance[topk_idx] - - # decode box - decode_box = center + [-1, -1, 1, 1] * box_distance - - select_scores.append(score) - decode_boxes.append(decode_box) - - # nms - bboxes = np.concatenate(decode_boxes, axis=0) - confidences = np.concatenate(select_scores, axis=0) - picked_box_probs = [] - picked_labels = [] - for class_index in range(0, confidences.shape[1]): - probs = confidences[:, class_index] - mask = probs > self.score_threshold - probs = probs[mask] - if probs.shape[0] == 0: - continue - subset_boxes = bboxes[mask, :] - box_probs = np.concatenate([subset_boxes, probs.reshape(-1, 1)], axis=1) - box_probs = hard_nms( - box_probs, - iou_threshold=self.nms_threshold, - top_k=self.keep_top_k, - ) - picked_box_probs.append(box_probs) - picked_labels.extend([class_index] * box_probs.shape[0]) - - if len(picked_box_probs) == 0: - out_boxes_list.append(np.empty((0, 4))) - out_boxes_num.append(0) - - else: - picked_box_probs = np.concatenate(picked_box_probs) - - # resize output boxes - picked_box_probs[:, :4] = self.warp_boxes(picked_box_probs[:, :4], self.ori_shape[batch_id]) - im_scale = np.concatenate([self.scale_factor[batch_id][::-1], self.scale_factor[batch_id][::-1]]) - picked_box_probs[:, :4] /= im_scale - # clas score box - out_boxes_list.append( - np.concatenate( - [ - np.expand_dims(np.array(picked_labels), axis=-1), - np.expand_dims(picked_box_probs[:, 4], axis=-1), - picked_box_probs[:, :4], - ], - axis=1, - ) - ) - out_boxes_num.append(len(picked_labels)) - - out_boxes_list = np.concatenate(out_boxes_list, axis=0) - out_boxes_num = np.asarray(out_boxes_num).astype(np.int32) - return out_boxes_list, out_boxes_num diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/preprocess.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/preprocess.py deleted file mode 100644 index d6a4cab7c4a1..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/preprocess.py +++ /dev/null @@ -1,482 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
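For reference, a minimal sketch (with illustrative names and toy values, not part of the original files) of the distribution-to-box decoding performed in PicoDetPostProcess.__call__ above: each box side is predicted as a softmax distribution over reg_max + 1 bins, and the expectation of that distribution, scaled by the feature-map stride, gives the distance from the anchor-point center to that edge.

```python
import numpy as np
from scipy.special import softmax

reg_max = 7
stride = 8
center = np.array([100.0, 60.0, 100.0, 60.0])  # (cx, cy, cx, cy) of one anchor point

# raw logits for the four sides (left, top, right, bottom), shape (4, reg_max + 1)
logits = np.random.randn(4, reg_max + 1)

prob = softmax(logits, axis=1)                                # per-side distribution over bins
dist = (prob * np.arange(reg_max + 1)).sum(axis=1) * stride   # expected distance in pixels
box = center + np.array([-1.0, -1.0, 1.0, 1.0]) * dist        # decoded (x1, y1, x2, y2)
print(box)
```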
- -import cv2 -import numpy as np -from PIL import Image - -from .keypoint_preprocess import get_affine_transform - - -def decode_image(im_file, im_info): - """read rgb image - Args: - im_file (str|np.ndarray): input can be image path or np.ndarray - im_info (dict): info of image - Returns: - im (np.ndarray): processed image (np.ndarray) - im_info (dict): info of processed image - """ - if isinstance(im_file, str): - with open(im_file, "rb") as f: - im_read = f.read() - data = np.frombuffer(im_read, dtype="uint8") - im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode - im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) - else: - im = im_file - im_info["im_shape"] = np.array(im.shape[:2], dtype=np.float32) - im_info["scale_factor"] = np.array([1.0, 1.0], dtype=np.float32) - return im, im_info - - -class Resize_Mult32(object): - """resize image by target_size and max_size - Args: - target_size (int): the target size of image - keep_ratio (bool): whether keep_ratio or not, default true - interp (int): method of resize - """ - - def __init__(self, limit_side_len, limit_type, interp=cv2.INTER_LINEAR): - self.limit_side_len = limit_side_len - self.limit_type = limit_type - self.interp = interp - - def __call__(self, im, im_info): - """ - Args: - im (np.ndarray): image (np.ndarray) - im_info (dict): info of image - Returns: - im (np.ndarray): processed image (np.ndarray) - im_info (dict): info of processed image - """ - im_scale_y, im_scale_x = self.generate_scale(im) - im = cv2.resize(im, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) - im_info["im_shape"] = np.array(im.shape[:2]).astype("float32") - im_info["scale_factor"] = np.array([im_scale_y, im_scale_x]).astype("float32") - return im, im_info - - def generate_scale(self, img): - """ - Args: - img (np.ndarray): image (np.ndarray) - Returns: - im_scale_x: the resize ratio of X - im_scale_y: the resize ratio of Y - """ - limit_side_len = self.limit_side_len - h, w, c = img.shape - - # limit the max side - if self.limit_type == "max": - if h > w: - ratio = float(limit_side_len) / h - else: - ratio = float(limit_side_len) / w - elif self.limit_type == "min": - if h < w: - ratio = float(limit_side_len) / h - else: - ratio = float(limit_side_len) / w - elif self.limit_type == "resize_long": - ratio = float(limit_side_len) / max(h, w) - else: - raise Exception("not support limit type, image ") - resize_h = int(h * ratio) - resize_w = int(w * ratio) - - resize_h = max(int(round(resize_h / 32) * 32), 32) - resize_w = max(int(round(resize_w / 32) * 32), 32) - - im_scale_y = resize_h / float(h) - im_scale_x = resize_w / float(w) - return im_scale_y, im_scale_x - - -class Resize(object): - """resize image by target_size and max_size - Args: - target_size (int): the target size of image - keep_ratio (bool): whether keep_ratio or not, default true - interp (int): method of resize - """ - - def __init__(self, target_size, keep_ratio=True, interp=cv2.INTER_LINEAR): - if isinstance(target_size, int): - target_size = [target_size, target_size] - self.target_size = target_size - self.keep_ratio = keep_ratio - self.interp = interp - - def __call__(self, im, im_info): - """ - Args: - im (np.ndarray): image (np.ndarray) - im_info (dict): info of image - Returns: - im (np.ndarray): processed image (np.ndarray) - im_info (dict): info of processed image - """ - assert len(self.target_size) == 2 - assert self.target_size[0] > 0 and self.target_size[1] > 0 - im_scale_y, im_scale_x = self.generate_scale(im) - im = cv2.resize(im, None, None, 
fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) - im_info["im_shape"] = np.array(im.shape[:2]).astype("float32") - im_info["scale_factor"] = np.array([im_scale_y, im_scale_x]).astype("float32") - return im, im_info - - def generate_scale(self, im): - """ - Args: - im (np.ndarray): image (np.ndarray) - Returns: - im_scale_x: the resize ratio of X - im_scale_y: the resize ratio of Y - """ - origin_shape = im.shape[:2] - if self.keep_ratio: - im_size_min = np.min(origin_shape) - im_size_max = np.max(origin_shape) - target_size_min = np.min(self.target_size) - target_size_max = np.max(self.target_size) - im_scale = float(target_size_min) / float(im_size_min) - if np.round(im_scale * im_size_max) > target_size_max: - im_scale = float(target_size_max) / float(im_size_max) - im_scale_x = im_scale - im_scale_y = im_scale - else: - resize_h, resize_w = self.target_size - im_scale_y = resize_h / float(origin_shape[0]) - im_scale_x = resize_w / float(origin_shape[1]) - return im_scale_y, im_scale_x - - -class ShortSizeScale(object): - """ - Scale images by short size. - Args: - short_size(float | int): Short size of an image will be scaled to the short_size. - fixed_ratio(bool): Set whether to zoom according to a fixed ratio. default: True - do_round(bool): Whether to round up when calculating the zoom ratio. default: False - backend(str): Choose pillow or cv2 as the graphics processing backend. default: 'pillow' - """ - - def __init__(self, short_size, fixed_ratio=True, keep_ratio=None, do_round=False, backend="pillow"): - self.short_size = short_size - assert (fixed_ratio and not keep_ratio) or ( - not fixed_ratio - ), "fixed_ratio and keep_ratio cannot be true at the same time" - self.fixed_ratio = fixed_ratio - self.keep_ratio = keep_ratio - self.do_round = do_round - - assert backend in ["pillow", "cv2"], "Scale's backend must be pillow or cv2, but get {backend}" - - self.backend = backend - - def __call__(self, img): - """ - Performs resize operations. - Args: - img (PIL.Image): a PIL.Image. - return: - resized_img: a PIL.Image after scaling. 
- """ - - result_img = None - - if isinstance(img, np.ndarray): - h, w, _ = img.shape - elif isinstance(img, Image.Image): - w, h = img.size - else: - raise NotImplementedError - - if w <= h: - ow = self.short_size - if self.fixed_ratio: # default is True - oh = int(self.short_size * 4.0 / 3.0) - elif not self.keep_ratio: # no - oh = self.short_size - else: - scale_factor = self.short_size / w - oh = int(h * float(scale_factor) + 0.5) if self.do_round else int(h * self.short_size / w) - ow = int(w * float(scale_factor) + 0.5) if self.do_round else int(w * self.short_size / h) - else: - oh = self.short_size - if self.fixed_ratio: - ow = int(self.short_size * 4.0 / 3.0) - elif not self.keep_ratio: # no - ow = self.short_size - else: - scale_factor = self.short_size / h - oh = int(h * float(scale_factor) + 0.5) if self.do_round else int(h * self.short_size / w) - ow = int(w * float(scale_factor) + 0.5) if self.do_round else int(w * self.short_size / h) - - if type(img) == np.ndarray: - img = Image.fromarray(img, mode="RGB") - - if self.backend == "pillow": - result_img = img.resize((ow, oh), Image.BILINEAR) - elif self.backend == "cv2" and (self.keep_ratio is not None): - result_img = cv2.resize(img, (ow, oh), interpolation=cv2.INTER_LINEAR) - else: - result_img = Image.fromarray(cv2.resize(np.asarray(img), (ow, oh), interpolation=cv2.INTER_LINEAR)) - - return result_img - - -class NormalizeImage(object): - """normalize image - Args: - mean (list): im - mean - std (list): im / std - is_scale (bool): whether need im / 255 - norm_type (str): type in ['mean_std', 'none'] - """ - - def __init__(self, mean, std, is_scale=True, norm_type="mean_std"): - self.mean = mean - self.std = std - self.is_scale = is_scale - self.norm_type = norm_type - - def __call__(self, im, im_info): - """ - Args: - im (np.ndarray): image (np.ndarray) - im_info (dict): info of image - Returns: - im (np.ndarray): processed image (np.ndarray) - im_info (dict): info of processed image - """ - im = im.astype(np.float32, copy=False) - if self.is_scale: - scale = 1.0 / 255.0 - im *= scale - - if self.norm_type == "mean_std": - mean = np.array(self.mean)[np.newaxis, np.newaxis, :] - std = np.array(self.std)[np.newaxis, np.newaxis, :] - im -= mean - im /= std - return im, im_info - - -class Permute(object): - """permute image - Args: - to_bgr (bool): whether convert RGB to BGR - channel_first (bool): whether convert HWC to CHW - """ - - def __init__( - self, - ): - super(Permute, self).__init__() - - def __call__(self, im, im_info): - """ - Args: - im (np.ndarray): image (np.ndarray) - im_info (dict): info of image - Returns: - im (np.ndarray): processed image (np.ndarray) - im_info (dict): info of processed image - """ - im = im.transpose((2, 0, 1)).copy() - return im, im_info - - -class PadStride(object): - """padding image for model with FPN, instead PadBatch(pad_to_stride) in original config - Args: - stride (bool): model with FPN need image shape % stride == 0 - """ - - def __init__(self, stride=0): - self.coarsest_stride = stride - - def __call__(self, im, im_info): - """ - Args: - im (np.ndarray): image (np.ndarray) - im_info (dict): info of image - Returns: - im (np.ndarray): processed image (np.ndarray) - im_info (dict): info of processed image - """ - coarsest_stride = self.coarsest_stride - if coarsest_stride <= 0: - return im, im_info - im_c, im_h, im_w = im.shape - pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride) - pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride) - padding_im 
= np.zeros((im_c, pad_h, pad_w), dtype=np.float32) - padding_im[:, :im_h, :im_w] = im - return padding_im, im_info - - -class LetterBoxResize(object): - def __init__(self, target_size): - """ - Resize image to target size, convert normalized xywh to pixel xyxy - format ([x_center, y_center, width, height] -> [x0, y0, x1, y1]). - Args: - target_size (int|list): image target size. - """ - super(LetterBoxResize, self).__init__() - if isinstance(target_size, int): - target_size = [target_size, target_size] - self.target_size = target_size - - def letterbox(self, img, height, width, color=(127.5, 127.5, 127.5)): - # letterbox: resize a rectangular image to a padded rectangular - shape = img.shape[:2] # [height, width] - ratio_h = float(height) / shape[0] - ratio_w = float(width) / shape[1] - ratio = min(ratio_h, ratio_w) - new_shape = (round(shape[1] * ratio), round(shape[0] * ratio)) # [width, height] - padw = (width - new_shape[0]) / 2 - padh = (height - new_shape[1]) / 2 - top, bottom = round(padh - 0.1), round(padh + 0.1) - left, right = round(padw - 0.1), round(padw + 0.1) - - img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border - img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded rectangular - return img, ratio, padw, padh - - def __call__(self, im, im_info): - """ - Args: - im (np.ndarray): image (np.ndarray) - im_info (dict): info of image - Returns: - im (np.ndarray): processed image (np.ndarray) - im_info (dict): info of processed image - """ - assert len(self.target_size) == 2 - assert self.target_size[0] > 0 and self.target_size[1] > 0 - height, width = self.target_size - h, w = im.shape[:2] - im, ratio, padw, padh = self.letterbox(im, height=height, width=width) - - new_shape = [round(h * ratio), round(w * ratio)] - im_info["im_shape"] = np.array(new_shape, dtype=np.float32) - im_info["scale_factor"] = np.array([ratio, ratio], dtype=np.float32) - return im, im_info - - -class Pad(object): - def __init__(self, size, fill_value=[114.0, 114.0, 114.0]): - """ - Pad image to a specified size. 
- Args: - size (list[int]): image target size - fill_value (list[float]): rgb value of pad area, default (114.0, 114.0, 114.0) - """ - super(Pad, self).__init__() - if isinstance(size, int): - size = [size, size] - self.size = size - self.fill_value = fill_value - - def __call__(self, im, im_info): - im_h, im_w = im.shape[:2] - h, w = self.size - if h == im_h and w == im_w: - im = im.astype(np.float32) - return im, im_info - - canvas = np.ones((h, w, 3), dtype=np.float32) - canvas *= np.array(self.fill_value, dtype=np.float32) - canvas[0:im_h, 0:im_w, :] = im.astype(np.float32) - im = canvas - return im, im_info - - -class WarpAffine(object): - """Warp affine the image""" - - def __init__(self, keep_res=False, pad=31, input_h=512, input_w=512, scale=0.4, shift=0.1, down_ratio=4): - self.keep_res = keep_res - self.pad = pad - self.input_h = input_h - self.input_w = input_w - self.scale = scale - self.shift = shift - self.down_ratio = down_ratio - - def __call__(self, im, im_info): - """ - Args: - im (np.ndarray): image (np.ndarray) - im_info (dict): info of image - Returns: - im (np.ndarray): processed image (np.ndarray) - im_info (dict): info of processed image - """ - img = cv2.cvtColor(im, cv2.COLOR_RGB2BGR) - - h, w = img.shape[:2] - - if self.keep_res: - # True in detection eval/infer - input_h = (h | self.pad) + 1 - input_w = (w | self.pad) + 1 - s = np.array([input_w, input_h], dtype=np.float32) - c = np.array([w // 2, h // 2], dtype=np.float32) - - else: - # False in centertrack eval_mot/eval_mot - s = max(h, w) * 1.0 - input_h, input_w = self.input_h, self.input_w - c = np.array([w / 2.0, h / 2.0], dtype=np.float32) - - trans_input = get_affine_transform(c, s, 0, [input_w, input_h]) - img = cv2.resize(img, (w, h)) - inp = cv2.warpAffine(img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR) - - if not self.keep_res: - out_h = input_h // self.down_ratio - out_w = input_w // self.down_ratio - trans_output = get_affine_transform(c, s, 0, [out_w, out_h]) - - im_info.update( - { - "center": c, - "scale": s, - "out_height": out_h, - "out_width": out_w, - "inp_height": input_h, - "inp_width": input_w, - "trans_input": trans_input, - "trans_output": trans_output, - } - ) - return inp, im_info - - -def preprocess(im, preprocess_ops): - # process image by preprocess_ops - im_info = { - "scale_factor": np.array([1.0, 1.0], dtype=np.float32), - "im_shape": None, - } - im, im_info = decode_image(im, im_info) - for operator in preprocess_ops: - im, im_info = operator(im, im_info) - return im, im_info diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/util.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/util.py deleted file mode 100644 index 40c2383389fd..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/util.py +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
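For reference, a minimal sketch of the (im, im_info) operator-chaining convention that the preprocess() helper above relies on: every op is a callable taking and returning the (image, info-dict) pair, so pipelines can be assembled from the op classes defined in that file. The toy ops and the run_pipeline helper below are made up for illustration and are not part of the deleted code.

```python
import numpy as np

def to_float(im, im_info):
    # scale uint8 pixels to [0, 1]
    return im.astype(np.float32) / 255.0, im_info

def hwc_to_chw(im, im_info):
    # reorder channels the way Permute above does
    return im.transpose((2, 0, 1)), im_info

def run_pipeline(im, ops):
    im_info = {
        "im_shape": np.array(im.shape[:2], dtype=np.float32),
        "scale_factor": np.array([1.0, 1.0], dtype=np.float32),
    }
    for op in ops:
        im, im_info = op(im, im_info)
    return im, im_info

img = np.zeros((480, 640, 3), dtype=np.uint8)
out, info = run_pipeline(img, [to_float, hwc_to_chw])
print(out.shape, info["im_shape"])  # (3, 480, 640) [480. 640.]
```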
- -import math - -import cv2 -import matplotlib -import numpy as np - - -def pad_right_down_corner(img, stride, padValue): - h = img.shape[0] - w = img.shape[1] - - pad = 4 * [None] - pad[0] = 0 # up - pad[1] = 0 # left - pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down - pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right - - img_padded = img - pad_up = np.tile(img_padded[0:1, :, :] * 0 + padValue, (pad[0], 1, 1)) - img_padded = np.concatenate((pad_up, img_padded), axis=0) - pad_left = np.tile(img_padded[:, 0:1, :] * 0 + padValue, (1, pad[1], 1)) - img_padded = np.concatenate((pad_left, img_padded), axis=1) - pad_down = np.tile(img_padded[-2:-1, :, :] * 0 + padValue, (pad[2], 1, 1)) - img_padded = np.concatenate((img_padded, pad_down), axis=0) - pad_right = np.tile(img_padded[:, -2:-1, :] * 0 + padValue, (1, pad[3], 1)) - img_padded = np.concatenate((img_padded, pad_right), axis=1) - - return img_padded, pad - - -# transfer caffe model to pytorch which will match the layer name -def transfer(model, model_weights): - transfered_model_weights = {} - for weights_name in model.state_dict().keys(): - transfered_model_weights[weights_name] = model_weights[".".join(weights_name.split(".")[1:])] - return transfered_model_weights - - -# draw the body keypoint and lims -def draw_bodypose(canvas, candidate, subset): - stickwidth = 4 - limbSeq = [ - [2, 3], - [2, 6], - [3, 4], - [4, 5], - [6, 7], - [7, 8], - [2, 9], - [9, 10], - [10, 11], - [2, 12], - [12, 13], - [13, 14], - [2, 1], - [1, 15], - [15, 17], - [1, 16], - [16, 18], - [3, 17], - [6, 18], - ] - - colors = [ - [255, 0, 0], - [255, 85, 0], - [255, 170, 0], - [255, 255, 0], - [170, 255, 0], - [85, 255, 0], - [0, 255, 0], - [0, 255, 85], - [0, 255, 170], - [0, 255, 255], - [0, 170, 255], - [0, 85, 255], - [0, 0, 255], - [85, 0, 255], - [170, 0, 255], - [255, 0, 255], - [255, 0, 170], - [255, 0, 85], - ] - for i in range(18): - for n in range(len(subset)): - index = int(subset[n][i]) - if index == -1: - continue - x, y = candidate[index][0:2] - cv2.circle(canvas, (int(x), int(y)), 4, colors[i], thickness=-1) - for i in range(17): - for n in range(len(subset)): - index = subset[n][np.array(limbSeq[i]) - 1] - if -1 in index: - continue - cur_canvas = canvas.copy() - Y = candidate[index.astype(int), 0] - X = candidate[index.astype(int), 1] - mX = np.mean(X) - mY = np.mean(Y) - length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5 - angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) - polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1) - cv2.fillConvexPoly(cur_canvas, polygon, colors[i]) - canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) - return canvas - - -def draw_handpose(canvas, all_hand_peaks, show_number=False): - edges = [ - [0, 1], - [1, 2], - [2, 3], - [3, 4], - [0, 5], - [5, 6], - [6, 7], - [7, 8], - [0, 9], - [9, 10], - [10, 11], - [11, 12], - [0, 13], - [13, 14], - [14, 15], - [15, 16], - [0, 17], - [17, 18], - [18, 19], - [19, 20], - ] - - for peaks in all_hand_peaks: - for ie, e in enumerate(edges): - if np.sum(np.all(peaks[e], axis=1) == 0) == 0: - x1, y1 = peaks[e[0]] - x2, y2 = peaks[e[1]] - cv2.line( - canvas, - (x1, y1), - (x2, y2), - matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, - thickness=2, - ) - - for i, keyponit in enumerate(peaks): - x, y = keyponit - cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1) - if show_number: - cv2.putText(canvas, str(i), (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 
0, 0), lineType=cv2.LINE_AA) - return canvas - - -# detect hand according to body pose keypoints -# please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp -def hand_detect(candidate, subset, oriImg): - # right hand: wrist 4, elbow 3, shoulder 2 - # left hand: wrist 7, elbow 6, shoulder 5 - ratioWristElbow = 0.4 - detect_result = [] - image_height, image_width = oriImg.shape[0:2] - for person in subset.astype(int): - # if any of three not detected - has_left = np.sum(person[[5, 6, 7]] == -1) == 0 - has_right = np.sum(person[[2, 3, 4]] == -1) == 0 - if not (has_left or has_right): - continue - hands = [] - # left hand - if has_left: - left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]] - x1, y1 = candidate[left_shoulder_index][:2] - x2, y2 = candidate[left_elbow_index][:2] - x3, y3 = candidate[left_wrist_index][:2] - hands.append([x1, y1, x2, y2, x3, y3, True]) - # right hand - if has_right: - right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]] - x1, y1 = candidate[right_shoulder_index][:2] - x2, y2 = candidate[right_elbow_index][:2] - x3, y3 = candidate[right_wrist_index][:2] - hands.append([x1, y1, x2, y2, x3, y3, False]) - - for x1, y1, x2, y2, x3, y3, is_left in hands: - # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox - # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]); - # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]); - # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow); - # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder); - # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder); - x = x3 + ratioWristElbow * (x3 - x2) - y = y3 + ratioWristElbow * (y3 - y2) - distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2) - distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2) - width = 1.0 * max(distanceWristElbow, 0.9 * distanceElbowShoulder) - # x-y refers to the center --> offset to topLeft point - # handRectangle.x -= handRectangle.width / 2.f; - # handRectangle.y -= handRectangle.height / 2.f; - x -= width / 2 - y -= width / 2 # width = height - # overflow the image - if x < 0: - x = 0 - if y < 0: - y = 0 - width1 = width - width2 = width - if x + width > image_width: - width1 = image_width - x - if y + width > image_height: - width2 = image_height - y - width = min(width1, width2) - # the max hand box value is 20 pixels - if width >= 20: - detect_result.append([int(x), int(y), int(width), is_left]) - - """ - return value: [[x, y, w, True if left hand else False]]. - width=height since the network require squared input. - x, y is the coordinate of top left - """ - return detect_result - - -# get max index of 2d array -def npmax(array): - arrayindex = array.argmax(1) - arrayvalue = array.max(1) - i = arrayvalue.argmax() - j = arrayindex[i] - return i, j diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/utils.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/utils.py deleted file mode 100644 index 8f2be3a37396..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/utils.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import ast -import os -import time - -import numpy as np - - -def argsparser(): - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--model_dir", - type=str, - default=None, - help=( - "Directory include:'model.pdiparams', 'model.pdmodel', " - "'infer_cfg.yml', created by tools/export_model.py." - ), - required=True, - ) - parser.add_argument("--image_file", type=str, default=None, help="Path of image file.") - parser.add_argument( - "--image_dir", type=str, default=None, help="Dir of image file, `image_file` has a higher priority." - ) - parser.add_argument("--batch_size", type=int, default=1, help="batch_size for inference.") - parser.add_argument( - "--video_file", - type=str, - default=None, - help="Path of video file, `video_file` or `camera_id` has a highest priority.", - ) - parser.add_argument("--camera_id", type=int, default=-1, help="device id of camera to predict.") - parser.add_argument("--threshold", type=float, default=0.5, help="Threshold of score.") - parser.add_argument("--output_dir", type=str, default="output", help="Directory of output visualization files.") - parser.add_argument( - "--run_mode", type=str, default="paddle", help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)" - ) - parser.add_argument( - "--device", - type=str, - default="cpu", - help="Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU.", - ) - parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Deprecated, please use `--device`.") - parser.add_argument( - "--run_benchmark", - type=ast.literal_eval, - default=False, - help="Whether to predict a image_file repeatedly for benchmark", - ) - parser.add_argument("--enable_mkldnn", type=ast.literal_eval, default=False, help="Whether use mkldnn with CPU.") - parser.add_argument( - "--enable_mkldnn_bfloat16", - type=ast.literal_eval, - default=False, - help="Whether use mkldnn bfloat16 inference with CPU.", - ) - parser.add_argument("--cpu_threads", type=int, default=1, help="Num of threads with CPU.") - parser.add_argument("--trt_min_shape", type=int, default=1, help="min_shape for TensorRT.") - parser.add_argument("--trt_max_shape", type=int, default=1280, help="max_shape for TensorRT.") - parser.add_argument("--trt_opt_shape", type=int, default=640, help="opt_shape for TensorRT.") - parser.add_argument( - "--trt_calib_mode", - type=bool, - default=False, - help="If the model is produced by TRT offline quantitative " "calibration, trt_calib_mode need to set True.", - ) - parser.add_argument("--save_images", type=ast.literal_eval, default=True, help="Save visualization image results.") - parser.add_argument("--save_mot_txts", action="store_true", help="Save tracking results (txt).") - parser.add_argument( - "--save_mot_txt_per_img", action="store_true", help="Save tracking results (txt) for each image." 
- ) - parser.add_argument( - "--scaled", - type=bool, - default=False, - help="Whether coords after detector outputs are scaled, False in JDE YOLOv3 " "True in general detector.", - ) - parser.add_argument("--tracker_config", type=str, default=None, help=("tracker donfig")) - parser.add_argument( - "--reid_model_dir", - type=str, - default=None, - help=( - "Directory include:'model.pdiparams', 'model.pdmodel', " - "'infer_cfg.yml', created by tools/export_model.py." - ), - ) - parser.add_argument("--reid_batch_size", type=int, default=50, help="max batch_size for reid model inference.") - parser.add_argument( - "--use_dark", - type=ast.literal_eval, - default=True, - help="whether to use darkpose to get better keypoint position predict ", - ) - parser.add_argument("--action_file", type=str, default=None, help="Path of input file for action recognition.") - parser.add_argument( - "--window_size", type=int, default=50, help="Temporal size of skeleton feature for action recognition." - ) - parser.add_argument( - "--random_pad", type=ast.literal_eval, default=False, help="Whether do random padding for action recognition." - ) - parser.add_argument( - "--save_results", - action="store_true", - default=False, - help="Whether save detection result to file using coco format", - ) - parser.add_argument( - "--use_coco_category", - action="store_true", - default=False, - help="Whether to use the coco format dictionary `clsid2catid`", - ) - parser.add_argument( - "--slice_infer", - action="store_true", - help="Whether to slice the image and merge the inference results for small object detection.", - ) - parser.add_argument("--slice_size", nargs="+", type=int, default=[640, 640], help="Height of the sliced image.") - parser.add_argument( - "--overlap_ratio", - nargs="+", - type=float, - default=[0.25, 0.25], - help="Overlap height ratio of the sliced image.", - ) - parser.add_argument( - "--combine_method", - type=str, - default="nms", - help="Combine method of the sliced images' detection results, choose in ['nms', 'nmm', 'concat'].", - ) - parser.add_argument("--match_threshold", type=float, default=0.6, help="Combine method matching threshold.") - parser.add_argument( - "--match_metric", type=str, default="ios", help="Combine method matching metric, choose in ['iou', 'ios']." 
- ) - return parser - - -class Times(object): - def __init__(self): - self.time = 0.0 - # start time - self.st = 0.0 - # end time - self.et = 0.0 - - def start(self): - self.st = time.time() - - def end(self, repeats=1, accumulative=True): - self.et = time.time() - if accumulative: - self.time += (self.et - self.st) / repeats - else: - self.time = (self.et - self.st) / repeats - - def reset(self): - self.time = 0.0 - self.st = 0.0 - self.et = 0.0 - - def value(self): - return round(self.time, 4) - - -class Timer(Times): - def __init__(self, with_tracker=False): - super(Timer, self).__init__() - self.with_tracker = with_tracker - self.preprocess_time_s = Times() - self.inference_time_s = Times() - self.postprocess_time_s = Times() - self.tracking_time_s = Times() - self.img_num = 0 - - def info(self, average=False): - pre_time = self.preprocess_time_s.value() - infer_time = self.inference_time_s.value() - post_time = self.postprocess_time_s.value() - track_time = self.tracking_time_s.value() - - total_time = pre_time + infer_time + post_time - if self.with_tracker: - total_time = total_time + track_time - total_time = round(total_time, 4) - print("------------------ Inference Time Info ----------------------") - print("total_time(ms): {}, img_num: {}".format(total_time * 1000, self.img_num)) - preprocess_time = round(pre_time / max(1, self.img_num), 4) if average else pre_time - postprocess_time = round(post_time / max(1, self.img_num), 4) if average else post_time - inference_time = round(infer_time / max(1, self.img_num), 4) if average else infer_time - tracking_time = round(track_time / max(1, self.img_num), 4) if average else track_time - - average_latency = total_time / max(1, self.img_num) - qps = 0 - if total_time > 0: - qps = 1 / average_latency - print("average latency time(ms): {:.2f}, QPS: {:.2f}".format(average_latency * 1000, qps)) - if self.with_tracker: - print( - "preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}, tracking_time(ms): {:.2f}".format( - preprocess_time * 1000, inference_time * 1000, postprocess_time * 1000, tracking_time * 1000 - ) - ) - else: - print( - "preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}".format( - preprocess_time * 1000, inference_time * 1000, postprocess_time * 1000 - ) - ) - - def report(self, average=False): - dic = {} - pre_time = self.preprocess_time_s.value() - infer_time = self.inference_time_s.value() - post_time = self.postprocess_time_s.value() - track_time = self.tracking_time_s.value() - - dic["preprocess_time_s"] = round(pre_time / max(1, self.img_num), 4) if average else pre_time - dic["inference_time_s"] = round(infer_time / max(1, self.img_num), 4) if average else infer_time - dic["postprocess_time_s"] = round(post_time / max(1, self.img_num), 4) if average else post_time - dic["img_num"] = self.img_num - total_time = pre_time + infer_time + post_time - if self.with_tracker: - dic["tracking_time_s"] = round(track_time / max(1, self.img_num), 4) if average else track_time - total_time = total_time + track_time - dic["total_time_s"] = round(total_time, 4) - return dic - - -def get_current_memory_mb(): - """ - Obtain the CPU and GPU memory usage of the current program while it is running. - Note that calling this function is itself time-consuming.
- """ - import GPUtil - import psutil - import pynvml - - gpu_id = int(os.environ.get("CUDA_VISIBLE_DEVICES", 0)) - - pid = os.getpid() - p = psutil.Process(pid) - info = p.memory_full_info() - cpu_mem = info.uss / 1024.0 / 1024.0 - gpu_mem = 0 - gpu_percent = 0 - gpus = GPUtil.getGPUs() - if gpu_id is not None and len(gpus) > 0: - gpu_percent = gpus[gpu_id].load - pynvml.nvmlInit() - handle = pynvml.nvmlDeviceGetHandleByIndex(0) - meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle) - gpu_mem = meminfo.used / 1024.0 / 1024.0 - return round(cpu_mem, 4), round(gpu_mem, 4), round(gpu_percent, 4) - - -def multiclass_nms(bboxs, num_classes, match_threshold=0.6, match_metric="iou"): - final_boxes = [] - for c in range(num_classes): - idxs = bboxs[:, 0] == c - if np.count_nonzero(idxs) == 0: - continue - r = nms(bboxs[idxs, 1:], match_threshold, match_metric) - final_boxes.append(np.concatenate([np.full((r.shape[0], 1), c), r], 1)) - return final_boxes - - -def nms(dets, match_threshold=0.6, match_metric="iou"): - """Apply NMS to avoid detecting too many overlapping bounding boxes. - Args: - dets: shape [N, 5], [score, x1, y1, x2, y2] - match_metric: 'iou' or 'ios' - match_threshold: overlap thresh for match metric. - """ - if dets.shape[0] == 0: - return dets[[], :] - scores = dets[:, 0] - x1 = dets[:, 1] - y1 = dets[:, 2] - x2 = dets[:, 3] - y2 = dets[:, 4] - areas = (x2 - x1 + 1) * (y2 - y1 + 1) - order = scores.argsort()[::-1] - - ndets = dets.shape[0] - suppressed = np.zeros((ndets), dtype=np.int32) - - for _i in range(ndets): - i = order[_i] - if suppressed[i] == 1: - continue - ix1 = x1[i] - iy1 = y1[i] - ix2 = x2[i] - iy2 = y2[i] - iarea = areas[i] - for _j in range(_i + 1, ndets): - j = order[_j] - if suppressed[j] == 1: - continue - xx1 = max(ix1, x1[j]) - yy1 = max(iy1, y1[j]) - xx2 = min(ix2, x2[j]) - yy2 = min(iy2, y2[j]) - w = max(0.0, xx2 - xx1 + 1) - h = max(0.0, yy2 - yy1 + 1) - inter = w * h - if match_metric == "iou": - union = iarea + areas[j] - inter - match_value = inter / union - elif match_metric == "ios": - smaller = min(iarea, areas[j]) - match_value = inter / smaller - else: - raise ValueError() - if match_value >= match_threshold: - suppressed[j] = 1 - keep = np.where(suppressed == 0)[0] - dets = dets[keep, :] - return dets - - -coco_clsid2catid = { - 0: 1, - 1: 2, - 2: 3, - 3: 4, - 4: 5, - 5: 6, - 6: 7, - 7: 8, - 8: 9, - 9: 10, - 10: 11, - 11: 13, - 12: 14, - 13: 15, - 14: 16, - 15: 17, - 16: 18, - 17: 19, - 18: 20, - 19: 21, - 20: 22, - 21: 23, - 22: 24, - 23: 25, - 24: 27, - 25: 28, - 26: 31, - 27: 32, - 28: 33, - 29: 34, - 30: 35, - 31: 36, - 32: 37, - 33: 38, - 34: 39, - 35: 40, - 36: 41, - 37: 42, - 38: 43, - 39: 44, - 40: 46, - 41: 47, - 42: 48, - 43: 49, - 44: 50, - 45: 51, - 46: 52, - 47: 53, - 48: 54, - 49: 55, - 50: 56, - 51: 57, - 52: 58, - 53: 59, - 54: 60, - 55: 61, - 56: 62, - 57: 63, - 58: 64, - 59: 65, - 60: 67, - 61: 70, - 62: 72, - 63: 73, - 64: 74, - 65: 75, - 66: 76, - 67: 77, - 68: 78, - 69: 79, - 70: 80, - 71: 81, - 72: 82, - 73: 84, - 74: 85, - 75: 86, - 76: 87, - 77: 88, - 78: 89, - 79: 90, -} - - -def gaussian_radius(bbox_size, min_overlap): - height, width = bbox_size - - a1 = 1 - b1 = height + width - c1 = width * height * (1 - min_overlap) / (1 + min_overlap) - sq1 = np.sqrt(b1**2 - 4 * a1 * c1) - radius1 = (b1 + sq1) / (2 * a1) - - a2 = 4 - b2 = 2 * (height + width) - c2 = (1 - min_overlap) * width * height - sq2 = np.sqrt(b2**2 - 4 * a2 * c2) - radius2 = (b2 + sq2) / 2 - - a3 = 4 * min_overlap - b3 = -2 * min_overlap * (height + width) - c3 
= (min_overlap - 1) * width * height - sq3 = np.sqrt(b3**2 - 4 * a3 * c3) - radius3 = (b3 + sq3) / 2 - return min(radius1, radius2, radius3) - - -def gaussian2D(shape, sigma_x=1, sigma_y=1): - m, n = [(ss - 1.0) / 2.0 for ss in shape] - y, x = np.ogrid[-m : m + 1, -n : n + 1] - - h = np.exp(-(x * x / (2 * sigma_x * sigma_x) + y * y / (2 * sigma_y * sigma_y))) - h[h < np.finfo(h.dtype).eps * h.max()] = 0 - return h - - -def draw_umich_gaussian(heatmap, center, radius, k=1): - """ - draw_umich_gaussian, refer to https://github.com/xingyizhou/CenterNet/blob/master/src/lib/utils/image.py#L126 - """ - diameter = 2 * radius + 1 - gaussian = gaussian2D((diameter, diameter), sigma_x=diameter / 6, sigma_y=diameter / 6) - - x, y = int(center[0]), int(center[1]) - - height, width = heatmap.shape[0:2] - - left, right = min(x, radius), min(width - x, radius + 1) - top, bottom = min(y, radius), min(height - y, radius + 1) - - masked_heatmap = heatmap[y - top : y + bottom, x - left : x + right] - masked_gaussian = gaussian[radius - top : radius + bottom, radius - left : radius + right] - if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: - np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) - return heatmap diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/visualize.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/visualize.py deleted file mode 100644 index 745380ac8a82..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/visualize.py +++ /dev/null @@ -1,365 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import division - -import math -import os - -import cv2 -import numpy as np -from PIL import Image, ImageDraw, ImageFile - -ImageFile.LOAD_TRUNCATED_IMAGES = True - - -def visualize_box_mask(im, results, labels, threshold=0.5): - """ - Args: - im (str/np.ndarray): path of image/np.ndarray read by cv2 - results (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box, - matix element:[class, score, x_min, y_min, x_max, y_max] - MaskRCNN's results include 'masks': np.ndarray: - shape:[N, im_h, im_w] - labels (list): labels:['class1', ..., 'classn'] - threshold (float): Threshold of score. 
- Returns: - im (PIL.Image.Image): visualized image - """ - if isinstance(im, str): - im = Image.open(im).convert("RGB") - elif isinstance(im, np.ndarray): - im = Image.fromarray(im) - if "masks" in results and "boxes" in results and len(results["boxes"]) > 0: - im = draw_mask(im, results["boxes"], results["masks"], labels, threshold=threshold) - if "boxes" in results and len(results["boxes"]) > 0: - im = draw_box(im, results["boxes"], labels, threshold=threshold) - if "segm" in results: - im = draw_segm(im, results["segm"], results["label"], results["score"], labels, threshold=threshold) - return im - - -def get_color_map_list(num_classes): - """ - Args: - num_classes (int): number of class - Returns: - color_map (list): RGB color list - """ - color_map = num_classes * [0, 0, 0] - for i in range(0, num_classes): - j = 0 - lab = i - while lab: - color_map[i * 3] |= ((lab >> 0) & 1) << (7 - j) - color_map[i * 3 + 1] |= ((lab >> 1) & 1) << (7 - j) - color_map[i * 3 + 2] |= ((lab >> 2) & 1) << (7 - j) - j += 1 - lab >>= 3 - color_map = [color_map[i : i + 3] for i in range(0, len(color_map), 3)] - return color_map - - -def draw_mask(im, np_boxes, np_masks, labels, threshold=0.5): - """ - Args: - im (PIL.Image.Image): PIL image - np_boxes (np.ndarray): shape:[N,6], N: number of box, - matix element:[class, score, x_min, y_min, x_max, y_max] - np_masks (np.ndarray): shape:[N, im_h, im_w] - labels (list): labels:['class1', ..., 'classn'] - threshold (float): threshold of mask - Returns: - im (PIL.Image.Image): visualized image - """ - color_list = get_color_map_list(len(labels)) - w_ratio = 0.4 - alpha = 0.7 - im = np.array(im).astype("float32") - clsid2color = {} - expect_boxes = (np_boxes[:, 1] > threshold) & (np_boxes[:, 0] > -1) - np_boxes = np_boxes[expect_boxes, :] - np_masks = np_masks[expect_boxes, :, :] - im_h, im_w = im.shape[:2] - np_masks = np_masks[:, :im_h, :im_w] - for i in range(len(np_masks)): - clsid, _ = int(np_boxes[i][0]), np_boxes[i][1] - mask = np_masks[i] - if clsid not in clsid2color: - clsid2color[clsid] = color_list[clsid] - color_mask = clsid2color[clsid] - for c in range(3): - color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255 - idx = np.nonzero(mask) - color_mask = np.array(color_mask) - im[idx[0], idx[1], :] *= 1.0 - alpha - im[idx[0], idx[1], :] += alpha * color_mask - return Image.fromarray(im.astype("uint8")) - - -def draw_box(im, np_boxes, labels, threshold=0.5): - """ - Args: - im (PIL.Image.Image): PIL image - np_boxes (np.ndarray): shape:[N,6], N: number of box, - matix element:[class, score, x_min, y_min, x_max, y_max] - labels (list): labels:['class1', ..., 'classn'] - threshold (float): threshold of box - Returns: - im (PIL.Image.Image): visualized image - """ - draw_thickness = min(im.size) // 320 - draw = ImageDraw.Draw(im) - clsid2color = {} - color_list = get_color_map_list(len(labels)) - expect_boxes = (np_boxes[:, 1] > threshold) & (np_boxes[:, 0] > -1) - np_boxes = np_boxes[expect_boxes, :] - - for dt in np_boxes: - clsid, bbox, score = int(dt[0]), dt[2:], dt[1] - if clsid not in clsid2color: - clsid2color[clsid] = color_list[clsid] - color = tuple(clsid2color[clsid]) - - if len(bbox) == 4: - xmin, ymin, xmax, ymax = bbox - print( - "class_id:{:d}, confidence:{:.4f}, left_top:[{:.2f},{:.2f}]," - "right_bottom:[{:.2f},{:.2f}]".format(int(clsid), score, xmin, ymin, xmax, ymax) - ) - # draw bbox - draw.line( - [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), (xmin, ymin)], - width=draw_thickness, - fill=color, - ) - elif len(bbox) == 
8: - x1, y1, x2, y2, x3, y3, x4, y4 = bbox - draw.line([(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)], width=2, fill=color) - xmin = min(x1, x2, x3, x4) - ymin = min(y1, y2, y3, y4) - - # draw label - text = "{} {:.4f}".format(labels[clsid], score) - tw, th = draw.textsize(text) - draw.rectangle([(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color) - draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) - return im - - -def draw_segm(im, np_segms, np_label, np_score, labels, threshold=0.5, alpha=0.7): - """ - Draw segmentation on image - """ - w_ratio = 0.4 - color_list = get_color_map_list(len(labels)) - im = np.array(im).astype("float32") - clsid2color = {} - np_segms = np_segms.astype(np.uint8) - for i in range(np_segms.shape[0]): - mask, score, clsid = np_segms[i], np_score[i], np_label[i] - if score < threshold: - continue - - if clsid not in clsid2color: - clsid2color[clsid] = color_list[clsid] - color_mask = clsid2color[clsid] - for c in range(3): - color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255 - idx = np.nonzero(mask) - color_mask = np.array(color_mask) - idx0 = np.minimum(idx[0], im.shape[0] - 1) - idx1 = np.minimum(idx[1], im.shape[1] - 1) - im[idx0, idx1, :] *= 1.0 - alpha - im[idx0, idx1, :] += alpha * color_mask - sum_x = np.sum(mask, axis=0) - x = np.where(sum_x > 0.5)[0] - sum_y = np.sum(mask, axis=1) - y = np.where(sum_y > 0.5)[0] - x0, x1, y0, y1 = x[0], x[-1], y[0], y[-1] - cv2.rectangle(im, (x0, y0), (x1, y1), tuple(color_mask.astype("int32").tolist()), 1) - bbox_text = "%s %.2f" % (labels[clsid], score) - t_size = cv2.getTextSize(bbox_text, 0, 0.3, thickness=1)[0] - cv2.rectangle( - im, (x0, y0), (x0 + t_size[0], y0 - t_size[1] - 3), tuple(color_mask.astype("int32").tolist()), -1 - ) - cv2.putText(im, bbox_text, (x0, y0 - 2), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 0, 0), 1, lineType=cv2.LINE_AA) - return Image.fromarray(im.astype("uint8")) - - -def get_color(idx): - idx = idx * 3 - color = ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255) - return color - - -def visualize_pose( - imgfile, results, visual_thresh=0.6, save_name="pose.jpg", save_dir="output", returnimg=False, ids=None -): - try: - import matplotlib.pyplot as plt - - plt.switch_backend("agg") - except Exception as e: - print("Matplotlib not found, please install matplotlib." 
"for example: `pip install matplotlib`.") - raise e - skeletons, _ = results["keypoint"] - skeletons = np.array(skeletons) - kpt_nums = 17 - if len(skeletons) > 0: - kpt_nums = skeletons.shape[1] - if kpt_nums == 17: # plot coco keypoint - EDGES = [ - (0, 1), - (0, 2), - (1, 3), - (2, 4), - (3, 5), - (4, 6), - (5, 7), - (6, 8), - (7, 9), - (8, 10), - (5, 11), - (6, 12), - (11, 13), - (12, 14), - (13, 15), - (14, 16), - (11, 12), - ] - else: # plot mpii keypoint - EDGES = [ - (0, 1), - (1, 2), - (3, 4), - (4, 5), - (2, 6), - (3, 6), - (6, 7), - (7, 8), - (8, 9), - (10, 11), - (11, 12), - (13, 14), - (14, 15), - (8, 12), - (8, 13), - ] - NUM_EDGES = len(EDGES) - - colors = [ - [255, 0, 0], - [255, 85, 0], - [255, 170, 0], - [255, 255, 0], - [170, 255, 0], - [85, 255, 0], - [0, 255, 0], - [0, 255, 85], - [0, 255, 170], - [0, 255, 255], - [0, 170, 255], - [0, 85, 255], - [0, 0, 255], - [85, 0, 255], - [170, 0, 255], - [255, 0, 255], - [255, 0, 170], - [255, 0, 85], - ] - plt.figure() - - img = cv2.imread(imgfile) if type(imgfile) == str else imgfile - - color_set = results["colors"] if "colors" in results else None - - if "bbox" in results and ids is None: - bboxs = results["bbox"] - for j, rect in enumerate(bboxs): - xmin, ymin, xmax, ymax = rect - color = colors[0] if color_set is None else colors[color_set[j] % len(colors)] - cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, 1) - - canvas = img.copy() - for i in range(kpt_nums): - for j in range(len(skeletons)): - if skeletons[j][i, 2] < visual_thresh: - continue - if ids is None: - color = colors[i] if color_set is None else colors[color_set[j] % len(colors)] - else: - color = get_color(ids[j]) - - cv2.circle(canvas, tuple(skeletons[j][i, 0:2].astype("int32")), 2, color, thickness=-1) - - stickwidth = 2 - - for i in range(NUM_EDGES): - for j in range(len(skeletons)): - edge = EDGES[i] - if skeletons[j][edge[0], 2] < visual_thresh or skeletons[j][edge[1], 2] < visual_thresh: - continue - - cur_canvas = canvas.copy() - X = [skeletons[j][edge[0], 1], skeletons[j][edge[1], 1]] - Y = [skeletons[j][edge[0], 0], skeletons[j][edge[1], 0]] - mX = np.mean(X) - mY = np.mean(Y) - length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5 - angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) - polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1) - if ids is None: - color = colors[i] if color_set is None else colors[color_set[j] % len(colors)] - else: - color = get_color(ids[j]) - cv2.fillConvexPoly(cur_canvas, polygon, color) - canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) - if returnimg: - return canvas - save_name = os.path.join(save_dir, os.path.splitext(os.path.basename(imgfile))[0] + "_vis.jpg") - plt.imsave(save_name, canvas[:, :, ::-1]) - print("keypoint visualize image saved to: " + save_name) - plt.close() - - -def visualize_attr(im, results, boxes=None, is_mtmct=False): - if isinstance(im, str): - im = Image.open(im) - im = np.ascontiguousarray(np.copy(im)) - im = cv2.cvtColor(im, cv2.COLOR_RGB2BGR) - else: - im = np.ascontiguousarray(np.copy(im)) - - text_scale = max(0.5, im.shape[0] / 3000.0) - text_thickness = 1 - - line_inter = im.shape[0] / 40.0 - for i, res in enumerate(results): - if boxes is None: - text_w = 3 - text_h = 1 - elif is_mtmct: - box = boxes[i] # multi camera, bbox shape is x,y, w,h - text_w = int(box[0]) + 3 - text_h = int(box[1]) - else: - box = boxes[i] # single camera, bbox shape is 0, 0, x,y, w,h - text_w = int(box[2]) + 3 - text_h = int(box[3]) - 
for text in res: - text_h += int(line_inter) - text_loc = (text_w, text_h) - cv2.putText(im, text, text_loc, cv2.FONT_ITALIC, text_scale, (0, 255, 255), thickness=text_thickness) - return im diff --git a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/__init__.py b/ppdiffusers/examples/controlnet/annotator/segformer_paddle/__init__.py deleted file mode 100644 index 33eebbc62936..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/__init__.py +++ /dev/null @@ -1,400 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -import cv2 -import numpy as np -import paddle -from annotator.util import annotator_ckpts_path -from paddleseg.cvlibs import Config, manager -from paddleseg.transforms import Compose -from paddleseg.utils import get_image_list, get_sys_env, logger -from pydantic import NoneBytes - -from .predict import predict, quick_predict - - -def parse_args(): - parser = argparse.ArgumentParser(description="Model prediction") - - # params of prediction - parser.add_argument("--config", dest="cfg", help="The config file.", default=None, type=str) - parser.add_argument( - "--model_path", dest="model_path", help="The path of model for prediction", type=str, default=None - ) - parser.add_argument( - "--image_path", - dest="image_path", - help="The image to predict, which can be a path of image, or a file list containing image paths, or a directory including images", - type=str, - default=None, - ) - parser.add_argument( - "--save_dir", - dest="save_dir", - help="The directory for saving the predicted results", - type=str, - default="./output/result", - ) - - # augment for prediction - parser.add_argument( - "--aug_pred", - dest="aug_pred", - help="Whether to use mulit-scales and flip augment for prediction", - action="store_true", - ) - parser.add_argument("--scales", dest="scales", nargs="+", help="Scales for augment", type=float, default=1.0) - parser.add_argument( - "--flip_horizontal", - dest="flip_horizontal", - help="Whether to use flip horizontally augment", - action="store_true", - ) - parser.add_argument( - "--flip_vertical", dest="flip_vertical", help="Whether to use flip vertically augment", action="store_true" - ) - - # sliding window prediction - parser.add_argument( - "--is_slide", dest="is_slide", help="Whether to prediction by sliding window", action="store_true" - ) - parser.add_argument( - "--crop_size", - dest="crop_size", - nargs=2, - help="The crop size of sliding window, the first is width and the second is height.", - type=int, - default=None, - ) - parser.add_argument( - "--stride", - dest="stride", - nargs=2, - help="The stride of sliding window, the first is width and the second is height.", - type=int, - default=None, - ) - - # custom color map - parser.add_argument( - "--custom_color", - dest="custom_color", - nargs="+", - help="Save images with a custom color map. 
Default: None, use paddleseg's default color map.", - type=int, - default=None, - ) - - # set device - parser.add_argument( - "--device", - dest="device", - help="Device place to be set, which can be GPU, XPU, NPU, CPU", - default="gpu", - type=str, - ) - - return parser.parse_args() - - -custom_color = [ - [120, 120, 120], - [180, 120, 120], - [6, 230, 230], - [80, 50, 50], - [4, 200, 3], - [120, 120, 80], - [140, 140, 140], - [204, 5, 255], - [230, 230, 230], - [4, 250, 7], - [224, 5, 255], - [235, 255, 7], - [150, 5, 61], - [120, 120, 70], - [8, 255, 51], - [255, 6, 82], - [143, 255, 140], - [204, 255, 4], - [255, 51, 7], - [204, 70, 3], - [0, 102, 200], - [61, 230, 250], - [255, 6, 51], - [11, 102, 255], - [255, 7, 71], - [255, 9, 224], - [9, 7, 230], - [220, 220, 220], - [255, 9, 92], - [112, 9, 255], - [8, 255, 214], - [7, 255, 224], - [255, 184, 6], - [10, 255, 71], - [255, 41, 10], - [7, 255, 255], - [224, 255, 8], - [102, 8, 255], - [255, 61, 6], - [255, 194, 7], - [255, 122, 8], - [0, 255, 20], - [255, 8, 41], - [255, 5, 153], - [6, 51, 255], - [235, 12, 255], - [160, 150, 20], - [0, 163, 255], - [140, 140, 140], - [250, 10, 15], - [20, 255, 0], - [31, 255, 0], - [255, 31, 0], - [255, 224, 0], - [153, 255, 0], - [0, 0, 255], - [255, 71, 0], - [0, 235, 255], - [0, 173, 255], - [31, 0, 255], - [11, 200, 200], - [255, 82, 0], - [0, 255, 245], - [0, 61, 255], - [0, 255, 112], - [0, 255, 133], - [255, 0, 0], - [255, 163, 0], - [255, 102, 0], - [194, 255, 0], - [0, 143, 255], - [51, 255, 0], - [0, 82, 255], - [0, 255, 41], - [0, 255, 173], - [10, 0, 255], - [173, 255, 0], - [0, 255, 153], - [255, 92, 0], - [255, 0, 255], - [255, 0, 245], - [255, 0, 102], - [255, 173, 0], - [255, 0, 20], - [255, 184, 184], - [0, 31, 255], - [0, 255, 61], - [0, 71, 255], - [255, 0, 204], - [0, 255, 194], - [0, 255, 82], - [0, 10, 255], - [0, 112, 255], - [51, 0, 255], - [0, 194, 255], - [0, 122, 255], - [0, 255, 163], - [255, 153, 0], - [0, 255, 10], - [255, 112, 0], - [143, 255, 0], - [82, 0, 255], - [163, 255, 0], - [255, 235, 0], - [8, 184, 170], - [133, 0, 255], - [0, 255, 92], - [184, 0, 255], - [255, 0, 31], - [0, 184, 255], - [0, 214, 255], - [255, 0, 112], - [92, 255, 0], - [0, 224, 255], - [112, 224, 255], - [70, 184, 160], - [163, 0, 255], - [153, 0, 255], - [71, 255, 0], - [255, 0, 163], - [255, 204, 0], - [255, 0, 143], - [0, 255, 235], - [133, 255, 0], - [255, 0, 235], - [245, 0, 255], - [255, 0, 122], - [255, 245, 0], - [10, 190, 212], - [214, 255, 0], - [0, 204, 255], - [20, 0, 255], - [255, 255, 0], - [0, 153, 255], - [0, 41, 255], - [0, 255, 204], - [41, 0, 255], - [41, 255, 0], - [173, 0, 255], - [0, 245, 255], - [71, 0, 255], - [122, 0, 255], - [0, 255, 184], - [0, 92, 255], - [184, 255, 0], - [0, 133, 255], - [255, 214, 0], - [25, 194, 194], - [102, 255, 0], - [92, 0, 255], -] - - -def get_test_config(cfg, args): - - test_config = cfg.test_config - if "aug_eval" in test_config: - test_config.pop("aug_eval") - if args.aug_pred: - test_config["aug_pred"] = args.aug_pred - test_config["scales"] = args.scales - - if args.flip_horizontal: - test_config["flip_horizontal"] = args.flip_horizontal - - if args.flip_vertical: - test_config["flip_vertical"] = args.flip_vertical - - if args.is_slide: - test_config["is_slide"] = args.is_slide - test_config["crop_size"] = args.crop_size - test_config["stride"] = args.stride - - if args.custom_color: - test_config["custom_color"] = args.custom_color - - return test_config - - -def main(args): - env_info = get_sys_env() - - if args.device == "gpu" and 
env_info["Paddle compiled with cuda"] and env_info["GPUs used"]: - place = "gpu" - elif args.device == "xpu" and paddle.is_compiled_with_xpu(): - place = "xpu" - elif args.device == "npu" and paddle.is_compiled_with_npu(): - place = "npu" - else: - place = "cpu" - - paddle.set_device(place) - if not args.cfg: - raise RuntimeError("No configuration file specified.") - - cfg = Config(args.cfg) - cfg.check_sync_info() - - msg = "\n---------------Config Information---------------\n" - msg += str(cfg) - msg += "------------------------------------------------" - logger.info(msg) - - model = cfg.model - transforms = Compose(cfg.val_transforms) - image_list, image_dir = get_image_list(args.image_path) - logger.info("Number of predict images = {}".format(len(image_list))) - - test_config = get_test_config(cfg, args) - - predict( - model, - model_path=args.model_path, - transforms=transforms, - image_list=image_list, - image_dir=image_dir, - save_dir=args.save_dir, - **test_config, - ) - - -checkpoint_file = ( - "https://bj.bcebos.com/paddleseg/dygraph/cityscapes/segformer_b5_cityscapes_1024x1024_160k/model.pdparams" -) - - -class SegformerDetector: - def __init__(self, mode): - assert mode in ["cityscapes", "ade20k"], f"mode should in {['cityscapes', 'ade20k']}!" - if mode == "cityscapes": - segformer_annotator_ckpts_path = os.path.join(annotator_ckpts_path, "segformer_model") - modelpath = os.path.join(segformer_annotator_ckpts_path, "model.pdparams") - if not os.path.exists(modelpath): - from paddlenlp.utils.downloader import get_path_from_url_with_filelock - - get_path_from_url_with_filelock(checkpoint_file, root_dir=segformer_annotator_ckpts_path) - self.model_path = modelpath - - cfg = "annotator/segformer_paddle/segformer_b5_cityscapes_1024x1024_160k.yml" - else: - segformer_annotator_ckpts_path = os.path.join(annotator_ckpts_path, "segformer_model") - modelpath = os.path.join(segformer_annotator_ckpts_path, "segformer_b5_ade20k_512x512_160k.pdparams") - - self.model_path = modelpath - - cfg = "annotator/segformer_paddle/segformer_b5_ade20k_512x512_160k.yml" - - cfg = Config(cfg) - cfg.check_sync_info() - - msg = "\n---------------Config Information---------------\n" - msg += str(cfg) - msg += "------------------------------------------------" - logger.info(msg) - - self.model = cfg.model - self.transforms = Compose(cfg.val_transforms) - args = parse_args() - self.test_config = get_test_config(cfg, args) - - def __call__(self, img): - # img= img.swapaxes(0, 2) - custom_color_flatten = [] - for color in custom_color: - custom_color_flatten += color - - res_img, pred_mask = quick_predict( - self.model, - model_path=self.model_path, - transforms=self.transforms, - image_list=[img], - image_dir=None, - save_dir="output", - skip_save=True, - custom_color=custom_color_flatten, - **self.test_config, - ) - pred_mask = cv2.cvtColor(np.asarray(pred_mask.convert("RGB"))[:, :, ::-1], cv2.COLOR_RGB2BGR) - - return pred_mask - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/predict.py b/ppdiffusers/examples/controlnet/annotator/segformer_paddle/predict.py deleted file mode 100644 index 5e1850259a3f..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/predict.py +++ /dev/null @@ -1,261 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import os - -import cv2 -import numpy as np -import paddle -from paddleseg import utils -from paddleseg.core import infer -from paddleseg.utils import logger, progbar, visualize - - -def mkdir(path): - sub_dir = os.path.dirname(path) - if not os.path.exists(sub_dir): - os.makedirs(sub_dir) - - -def partition_list(arr, m): - """split the list 'arr' into m pieces""" - n = int(math.ceil(len(arr) / float(m))) - return [arr[i : i + n] for i in range(0, len(arr), n)] - - -def preprocess(im_path, transforms): - data = {} - data["img"] = im_path - if transforms: - data = transforms(data) - data["img"] = data["img"][np.newaxis, ...] - data["img"] = paddle.to_tensor(data["img"]) - return data - - -def predict( - model, - model_path, - transforms, - image_list, - image_dir=None, - save_dir="output", - aug_pred=False, - scales=1.0, - flip_horizontal=True, - flip_vertical=False, - is_slide=False, - stride=None, - crop_size=None, - custom_color=None, -): - """ - predict and visualize the image_list. - - Args: - model (nn.Layer): Used to predict for input image. - model_path (str): The path of pretrained model. - transforms (transform.Compose): Preprocess for input image. - image_list (list): A list of image path to be predicted. - image_dir (str, optional): The root directory of the images predicted. Default: None. - save_dir (str, optional): The directory to save the visualized results. Default: 'output'. - aug_pred (bool, optional): Whether to use mulit-scales and flip augment for predition. Default: False. - scales (list|float, optional): Scales for augment. It is valid when `aug_pred` is True. Default: 1.0. - flip_horizontal (bool, optional): Whether to use flip horizontally augment. It is valid when `aug_pred` is True. Default: True. - flip_vertical (bool, optional): Whether to use flip vertically augment. It is valid when `aug_pred` is True. Default: False. - is_slide (bool, optional): Whether to predict by sliding window. Default: False. - stride (tuple|list, optional): The stride of sliding window, the first is width and the second is height. - It should be provided when `is_slide` is True. - crop_size (tuple|list, optional): The crop size of sliding window, the first is width and the second is height. - It should be provided when `is_slide` is True. - custom_color (list, optional): Save images with a custom color map. Default: None, use paddleseg's default color map. 
- - """ - utils.utils.load_entire_model(model, model_path) - model.eval() - nranks = paddle.distributed.get_world_size() - local_rank = paddle.distributed.get_rank() - if nranks > 1: - img_lists = partition_list(image_list, nranks) - else: - img_lists = [image_list] - - added_saved_dir = os.path.join(save_dir, "added_prediction") - pred_saved_dir = os.path.join(save_dir, "pseudo_color_prediction") - - logger.info("Start to predict...") - progbar_pred = progbar.Progbar(target=len(img_lists[0]), verbose=1) - color_map = visualize.get_color_map_list(256, custom_color=custom_color) - with paddle.no_grad(): - for i, im_path in enumerate(img_lists[local_rank]): - data = preprocess(im_path, transforms) - - if aug_pred: - pred, _ = infer.aug_inference( - model, - data["img"], - trans_info=data["trans_info"], - scales=scales, - flip_horizontal=flip_horizontal, - flip_vertical=flip_vertical, - is_slide=is_slide, - stride=stride, - crop_size=crop_size, - ) - else: - pred, _ = infer.inference( - model, - data["img"], - trans_info=data["trans_info"], - is_slide=is_slide, - stride=stride, - crop_size=crop_size, - ) - pred = paddle.squeeze(pred) - pred = pred.numpy().astype("uint8") - - # get the saved name - if image_dir is not None: - im_file = im_path.replace(image_dir, "") - else: - im_file = os.path.basename(im_path) - if im_file[0] == "/" or im_file[0] == "\\": - im_file = im_file[1:] - - # save added image - added_image = utils.visualize.visualize(im_path, pred, color_map, weight=0.6) - added_image_path = os.path.join(added_saved_dir, im_file) - mkdir(added_image_path) - cv2.imwrite(added_image_path, added_image) - - # save pseudo color prediction - pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map) - pred_saved_path = os.path.join(pred_saved_dir, os.path.splitext(im_file)[0] + ".png") - mkdir(pred_saved_path) - pred_mask.save(pred_saved_path) - - progbar_pred.update(i + 1) - # return pred - - -def quick_predict( - model, - model_path, - transforms, - image_list, - image_dir=None, - save_dir="output", - aug_pred=False, - scales=1.0, - flip_horizontal=True, - flip_vertical=False, - is_slide=False, - stride=None, - crop_size=None, - custom_color=None, - skip_save=True, -): - """ - predict and visualize the image_list. - - Args: - model (nn.Layer): Used to predict for input image. - model_path (str): The path of pretrained model. - transforms (transform.Compose): Preprocess for input image. - image_list (list): A list of image path to be predicted. - image_dir (str, optional): The root directory of the images predicted. Default: None. - save_dir (str, optional): The directory to save the visualized results. Default: 'output'. - aug_pred (bool, optional): Whether to use mulit-scales and flip augment for predition. Default: False. - scales (list|float, optional): Scales for augment. It is valid when `aug_pred` is True. Default: 1.0. - flip_horizontal (bool, optional): Whether to use flip horizontally augment. It is valid when `aug_pred` is True. Default: True. - flip_vertical (bool, optional): Whether to use flip vertically augment. It is valid when `aug_pred` is True. Default: False. - is_slide (bool, optional): Whether to predict by sliding window. Default: False. - stride (tuple|list, optional): The stride of sliding window, the first is width and the second is height. - It should be provided when `is_slide` is True. - crop_size (tuple|list, optional): The crop size of sliding window, the first is width and the second is height. - It should be provided when `is_slide` is True. 
- custom_color (list, optional): Save images with a custom color map. Default: None, use paddleseg's default color map. - - """ - utils.utils.load_entire_model(model, model_path) - model.eval() - nranks = paddle.distributed.get_world_size() - local_rank = paddle.distributed.get_rank() - if nranks > 1: - img_lists = partition_list(image_list, nranks) - else: - img_lists = [image_list] - - if not skip_save: - added_saved_dir = os.path.join(save_dir, "added_prediction") - pred_saved_dir = os.path.join(save_dir, "pseudo_color_prediction") - - logger.info("Start to predict...") - progbar_pred = progbar.Progbar(target=len(img_lists[0]), verbose=1) - color_map = visualize.get_color_map_list(256, custom_color=custom_color) - with paddle.no_grad(): - for i, im_path in enumerate(img_lists[local_rank]): - data = preprocess(im_path, transforms) - - if aug_pred: - pred, _ = infer.aug_inference( - model, - data["img"], - trans_info=data["trans_info"], - scales=scales, - flip_horizontal=flip_horizontal, - flip_vertical=flip_vertical, - is_slide=is_slide, - stride=stride, - crop_size=crop_size, - ) - else: - pred, _ = infer.inference( - model, - data["img"], - trans_info=data["trans_info"], - is_slide=is_slide, - stride=stride, - crop_size=crop_size, - ) - pred = paddle.squeeze(pred) - pred = pred.numpy().astype("uint8") - - # get the saved name - if not skip_save: - if image_dir is not None: - im_file = im_path.replace(image_dir, "") - else: - im_file = os.path.basename(im_path) - if im_file[0] == "/" or im_file[0] == "\\": - im_file = im_file[1:] - - # save added image - if not skip_save: - added_image = utils.visualize.visualize(im_path, pred, color_map, weight=0.6) - added_image_path = os.path.join(added_saved_dir, im_file) - mkdir(added_image_path) - cv2.imwrite(added_image_path, added_image) - - # save pseudo color prediction - pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map) - if not skip_save: - pred_saved_path = os.path.join(pred_saved_dir, os.path.splitext(im_file)[0] + ".png") - mkdir(pred_saved_path) - pred_mask.save(pred_saved_path) - - progbar_pred.update(i + 1) - return pred, pred_mask diff --git a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/segformer_b5_ade20k_512x512_160k.yml b/ppdiffusers/examples/controlnet/annotator/segformer_paddle/segformer_b5_ade20k_512x512_160k.yml deleted file mode 100644 index 266ec90a72d4..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/segformer_b5_ade20k_512x512_160k.yml +++ /dev/null @@ -1,45 +0,0 @@ -_base_: '../_base_/ade20k.yml' - -batch_size: 1 -iters: 160000 - -model: - type: SegFormer - backbone: - type: MixVisionTransformer_B5 - pretrained: https://bj.bcebos.com/paddleseg/dygraph/backbone/mix_vision_transformer_b5.tar.gz - embedding_dim: 768 - num_classes: 150 - - -val_dataset: - transforms: - - type: Resize - target_size: [2048, 512] - keep_ratio: True - size_divisor: 32 - - type: Normalize - mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - -optimizer: - _inherited_: False - type: AdamW - beta1: 0.9 - beta2: 0.999 - weight_decay: 0.01 - -lr_scheduler: - type: PolynomialDecay - learning_rate: 0.00006 - power: 1 - -loss: - types: - - type: CrossEntropyLoss - coef: [1] - -test_config: - is_slide: True - crop_size: [1024, 1024] - stride: [768, 768] diff --git a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/segformer_b5_cityscapes_1024x1024_160k.yml b/ppdiffusers/examples/controlnet/annotator/segformer_paddle/segformer_b5_cityscapes_1024x1024_160k.yml deleted file mode 
100644 index 6fd1b90e999f..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/segformer_b5_cityscapes_1024x1024_160k.yml +++ /dev/null @@ -1,34 +0,0 @@ -_base_: '../_base_/cityscapes_1024x1024.yml' - -batch_size: 1 -iters: 160000 - -model: - type: SegFormer - backbone: - type: MixVisionTransformer_B5 - pretrained: https://bj.bcebos.com/paddleseg/dygraph/backbone/mix_vision_transformer_b5.tar.gz - embedding_dim: 768 - num_classes: 19 - -optimizer: - _inherited_: False - type: AdamW - beta1: 0.9 - beta2: 0.999 - weight_decay: 0.01 - -lr_scheduler: - type: PolynomialDecay - learning_rate: 0.00006 - power: 1 - -loss: - types: - - type: CrossEntropyLoss - coef: [1] - -test_config: - is_slide: True - crop_size: [1024, 1024] - stride: [768, 768] diff --git a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/__init__.py b/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/__init__.py deleted file mode 100644 index fc06b241eb9d..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/__init__.py +++ /dev/null @@ -1,388 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
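For orientation, the segmenter_paddle module that follows mirrors the segformer_paddle annotator removed above and exposes the same call convention. A minimal usage sketch for these annotators, assuming PaddleSeg and the pretrained checkpoints are available and the script is run from the controlnet example directory (the config paths used by the detectors are relative); the input image path is hypothetical:

import cv2

from annotator.segformer_paddle import SegformerDetector

# Hypothetical test image; the detector expects an HWC uint8 array.
img = cv2.imread("test_imgs/street.png")

# "cityscapes" downloads its checkpoint on first use; "ade20k" expects the
# weights to already exist under annotator_ckpts_path.
detector = SegformerDetector(mode="cityscapes")

# Returns a color-coded segmentation map (BGR uint8) intended as the
# conditioning image for the seg2image ControlNet examples.
seg_map = detector(img)
cv2.imwrite("street_seg.png", seg_map)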
- -import argparse -import os - -import cv2 -import numpy as np -import paddle -from annotator.util import annotator_ckpts_path -from paddleseg.cvlibs import Config, manager -from paddleseg.transforms import Compose -from paddleseg.utils import get_image_list, get_sys_env, logger -from pydantic import NoneBytes - -from .predict import predict, quick_predict - - -def parse_args(): - parser = argparse.ArgumentParser(description="Model prediction") - - # params of prediction - parser.add_argument("--config", dest="cfg", help="The config file.", default=None, type=str) - parser.add_argument( - "--model_path", dest="model_path", help="The path of model for prediction", type=str, default=None - ) - parser.add_argument( - "--image_path", - dest="image_path", - help="The image to predict, which can be a path of image, or a file list containing image paths, or a directory including images", - type=str, - default=None, - ) - parser.add_argument( - "--save_dir", - dest="save_dir", - help="The directory for saving the predicted results", - type=str, - default="./output/result", - ) - - # augment for prediction - parser.add_argument( - "--aug_pred", - dest="aug_pred", - help="Whether to use mulit-scales and flip augment for prediction", - action="store_true", - ) - parser.add_argument("--scales", dest="scales", nargs="+", help="Scales for augment", type=float, default=1.0) - parser.add_argument( - "--flip_horizontal", - dest="flip_horizontal", - help="Whether to use flip horizontally augment", - action="store_true", - ) - parser.add_argument( - "--flip_vertical", dest="flip_vertical", help="Whether to use flip vertically augment", action="store_true" - ) - - # sliding window prediction - parser.add_argument( - "--is_slide", dest="is_slide", help="Whether to prediction by sliding window", action="store_true" - ) - parser.add_argument( - "--crop_size", - dest="crop_size", - nargs=2, - help="The crop size of sliding window, the first is width and the second is height.", - type=int, - default=None, - ) - parser.add_argument( - "--stride", - dest="stride", - nargs=2, - help="The stride of sliding window, the first is width and the second is height.", - type=int, - default=None, - ) - - # custom color map - parser.add_argument( - "--custom_color", - dest="custom_color", - nargs="+", - help="Save images with a custom color map. 
Default: None, use paddleseg's default color map.", - type=int, - default=None, - ) - - # set device - parser.add_argument( - "--device", - dest="device", - help="Device place to be set, which can be GPU, XPU, NPU, CPU", - default="gpu", - type=str, - ) - - return parser.parse_args() - - -custom_color = [ - [120, 120, 120], - [180, 120, 120], - [6, 230, 230], - [80, 50, 50], - [4, 200, 3], - [120, 120, 80], - [140, 140, 140], - [204, 5, 255], - [230, 230, 230], - [4, 250, 7], - [224, 5, 255], - [235, 255, 7], - [150, 5, 61], - [120, 120, 70], - [8, 255, 51], - [255, 6, 82], - [143, 255, 140], - [204, 255, 4], - [255, 51, 7], - [204, 70, 3], - [0, 102, 200], - [61, 230, 250], - [255, 6, 51], - [11, 102, 255], - [255, 7, 71], - [255, 9, 224], - [9, 7, 230], - [220, 220, 220], - [255, 9, 92], - [112, 9, 255], - [8, 255, 214], - [7, 255, 224], - [255, 184, 6], - [10, 255, 71], - [255, 41, 10], - [7, 255, 255], - [224, 255, 8], - [102, 8, 255], - [255, 61, 6], - [255, 194, 7], - [255, 122, 8], - [0, 255, 20], - [255, 8, 41], - [255, 5, 153], - [6, 51, 255], - [235, 12, 255], - [160, 150, 20], - [0, 163, 255], - [140, 140, 140], - [250, 10, 15], - [20, 255, 0], - [31, 255, 0], - [255, 31, 0], - [255, 224, 0], - [153, 255, 0], - [0, 0, 255], - [255, 71, 0], - [0, 235, 255], - [0, 173, 255], - [31, 0, 255], - [11, 200, 200], - [255, 82, 0], - [0, 255, 245], - [0, 61, 255], - [0, 255, 112], - [0, 255, 133], - [255, 0, 0], - [255, 163, 0], - [255, 102, 0], - [194, 255, 0], - [0, 143, 255], - [51, 255, 0], - [0, 82, 255], - [0, 255, 41], - [0, 255, 173], - [10, 0, 255], - [173, 255, 0], - [0, 255, 153], - [255, 92, 0], - [255, 0, 255], - [255, 0, 245], - [255, 0, 102], - [255, 173, 0], - [255, 0, 20], - [255, 184, 184], - [0, 31, 255], - [0, 255, 61], - [0, 71, 255], - [255, 0, 204], - [0, 255, 194], - [0, 255, 82], - [0, 10, 255], - [0, 112, 255], - [51, 0, 255], - [0, 194, 255], - [0, 122, 255], - [0, 255, 163], - [255, 153, 0], - [0, 255, 10], - [255, 112, 0], - [143, 255, 0], - [82, 0, 255], - [163, 255, 0], - [255, 235, 0], - [8, 184, 170], - [133, 0, 255], - [0, 255, 92], - [184, 0, 255], - [255, 0, 31], - [0, 184, 255], - [0, 214, 255], - [255, 0, 112], - [92, 255, 0], - [0, 224, 255], - [112, 224, 255], - [70, 184, 160], - [163, 0, 255], - [153, 0, 255], - [71, 255, 0], - [255, 0, 163], - [255, 204, 0], - [255, 0, 143], - [0, 255, 235], - [133, 255, 0], - [255, 0, 235], - [245, 0, 255], - [255, 0, 122], - [255, 245, 0], - [10, 190, 212], - [214, 255, 0], - [0, 204, 255], - [20, 0, 255], - [255, 255, 0], - [0, 153, 255], - [0, 41, 255], - [0, 255, 204], - [41, 0, 255], - [41, 255, 0], - [173, 0, 255], - [0, 245, 255], - [71, 0, 255], - [122, 0, 255], - [0, 255, 184], - [0, 92, 255], - [184, 255, 0], - [0, 133, 255], - [255, 214, 0], - [25, 194, 194], - [102, 255, 0], - [92, 0, 255], -] - - -def get_test_config(cfg, args): - - test_config = cfg.test_config - if "aug_eval" in test_config: - test_config.pop("aug_eval") - if args.aug_pred: - test_config["aug_pred"] = args.aug_pred - test_config["scales"] = args.scales - - if args.flip_horizontal: - test_config["flip_horizontal"] = args.flip_horizontal - - if args.flip_vertical: - test_config["flip_vertical"] = args.flip_vertical - - if args.is_slide: - test_config["is_slide"] = args.is_slide - test_config["crop_size"] = args.crop_size - test_config["stride"] = args.stride - - if args.custom_color: - test_config["custom_color"] = args.custom_color - - return test_config - - -def main(args): - env_info = get_sys_env() - - if args.device == "gpu" and 
env_info["Paddle compiled with cuda"] and env_info["GPUs used"]: - place = "gpu" - elif args.device == "xpu" and paddle.is_compiled_with_xpu(): - place = "xpu" - elif args.device == "npu" and paddle.is_compiled_with_npu(): - place = "npu" - else: - place = "cpu" - - paddle.set_device(place) - if not args.cfg: - raise RuntimeError("No configuration file specified.") - - cfg = Config(args.cfg) - cfg.check_sync_info() - - msg = "\n---------------Config Information---------------\n" - msg += str(cfg) - msg += "------------------------------------------------" - logger.info(msg) - - model = cfg.model - transforms = Compose(cfg.val_transforms) - image_list, image_dir = get_image_list(args.image_path) - logger.info("Number of predict images = {}".format(len(image_list))) - - test_config = get_test_config(cfg, args) - - predict( - model, - model_path=args.model_path, - transforms=transforms, - image_list=image_list, - image_dir=image_dir, - save_dir=args.save_dir, - **test_config, - ) - - -checkpoint_file = ( - "https://paddleseg.bj.bcebos.com/dygraph/ade20k/segmenter_vit_base_linear_ade20k_512x512_160k/model.pdparams" -) - - -class SegmenterDetector: - def __init__(self): - segmenter_annotator_ckpts_path = os.path.join(annotator_ckpts_path, "segmenter_model") - modelpath = os.path.join(segmenter_annotator_ckpts_path, "model.pdparams") - if not os.path.exists(modelpath): - from paddlenlp.utils.downloader import get_path_from_url_with_filelock - - get_path_from_url_with_filelock(checkpoint_file, root_dir=segmenter_annotator_ckpts_path) - self.model_path = modelpath - - cfg = "annotator/segmenter_paddle/segmenter_vit_base_linear_ade20k_512x512_160k.yml" - cfg = Config(cfg) - cfg.check_sync_info() - - msg = "\n---------------Config Information---------------\n" - msg += str(cfg) - msg += "------------------------------------------------" - logger.info(msg) - - self.model = cfg.model - self.transforms = Compose(cfg.val_transforms) - args = parse_args() - self.test_config = get_test_config(cfg, args) - - def __call__(self, img): - # img= img.swapaxes(0, 2) - custom_color_flatten = [] - for color in custom_color: - custom_color_flatten += color - res_img, pred_mask = quick_predict( - self.model, - model_path=self.model_path, - transforms=self.transforms, - image_list=[img], - image_dir=None, - save_dir="output", - skip_save=True, - custom_color=custom_color_flatten, - **self.test_config, - ) - pred_mask = cv2.cvtColor(np.asarray(pred_mask.convert("RGB"))[:, :, ::-1], cv2.COLOR_RGB2BGR) - return pred_mask - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/predict.py b/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/predict.py deleted file mode 100644 index 5e1850259a3f..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/predict.py +++ /dev/null @@ -1,261 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import os - -import cv2 -import numpy as np -import paddle -from paddleseg import utils -from paddleseg.core import infer -from paddleseg.utils import logger, progbar, visualize - - -def mkdir(path): - sub_dir = os.path.dirname(path) - if not os.path.exists(sub_dir): - os.makedirs(sub_dir) - - -def partition_list(arr, m): - """split the list 'arr' into m pieces""" - n = int(math.ceil(len(arr) / float(m))) - return [arr[i : i + n] for i in range(0, len(arr), n)] - - -def preprocess(im_path, transforms): - data = {} - data["img"] = im_path - if transforms: - data = transforms(data) - data["img"] = data["img"][np.newaxis, ...] - data["img"] = paddle.to_tensor(data["img"]) - return data - - -def predict( - model, - model_path, - transforms, - image_list, - image_dir=None, - save_dir="output", - aug_pred=False, - scales=1.0, - flip_horizontal=True, - flip_vertical=False, - is_slide=False, - stride=None, - crop_size=None, - custom_color=None, -): - """ - predict and visualize the image_list. - - Args: - model (nn.Layer): Used to predict for input image. - model_path (str): The path of pretrained model. - transforms (transform.Compose): Preprocess for input image. - image_list (list): A list of image path to be predicted. - image_dir (str, optional): The root directory of the images predicted. Default: None. - save_dir (str, optional): The directory to save the visualized results. Default: 'output'. - aug_pred (bool, optional): Whether to use mulit-scales and flip augment for predition. Default: False. - scales (list|float, optional): Scales for augment. It is valid when `aug_pred` is True. Default: 1.0. - flip_horizontal (bool, optional): Whether to use flip horizontally augment. It is valid when `aug_pred` is True. Default: True. - flip_vertical (bool, optional): Whether to use flip vertically augment. It is valid when `aug_pred` is True. Default: False. - is_slide (bool, optional): Whether to predict by sliding window. Default: False. - stride (tuple|list, optional): The stride of sliding window, the first is width and the second is height. - It should be provided when `is_slide` is True. - crop_size (tuple|list, optional): The crop size of sliding window, the first is width and the second is height. - It should be provided when `is_slide` is True. - custom_color (list, optional): Save images with a custom color map. Default: None, use paddleseg's default color map. 
- - """ - utils.utils.load_entire_model(model, model_path) - model.eval() - nranks = paddle.distributed.get_world_size() - local_rank = paddle.distributed.get_rank() - if nranks > 1: - img_lists = partition_list(image_list, nranks) - else: - img_lists = [image_list] - - added_saved_dir = os.path.join(save_dir, "added_prediction") - pred_saved_dir = os.path.join(save_dir, "pseudo_color_prediction") - - logger.info("Start to predict...") - progbar_pred = progbar.Progbar(target=len(img_lists[0]), verbose=1) - color_map = visualize.get_color_map_list(256, custom_color=custom_color) - with paddle.no_grad(): - for i, im_path in enumerate(img_lists[local_rank]): - data = preprocess(im_path, transforms) - - if aug_pred: - pred, _ = infer.aug_inference( - model, - data["img"], - trans_info=data["trans_info"], - scales=scales, - flip_horizontal=flip_horizontal, - flip_vertical=flip_vertical, - is_slide=is_slide, - stride=stride, - crop_size=crop_size, - ) - else: - pred, _ = infer.inference( - model, - data["img"], - trans_info=data["trans_info"], - is_slide=is_slide, - stride=stride, - crop_size=crop_size, - ) - pred = paddle.squeeze(pred) - pred = pred.numpy().astype("uint8") - - # get the saved name - if image_dir is not None: - im_file = im_path.replace(image_dir, "") - else: - im_file = os.path.basename(im_path) - if im_file[0] == "/" or im_file[0] == "\\": - im_file = im_file[1:] - - # save added image - added_image = utils.visualize.visualize(im_path, pred, color_map, weight=0.6) - added_image_path = os.path.join(added_saved_dir, im_file) - mkdir(added_image_path) - cv2.imwrite(added_image_path, added_image) - - # save pseudo color prediction - pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map) - pred_saved_path = os.path.join(pred_saved_dir, os.path.splitext(im_file)[0] + ".png") - mkdir(pred_saved_path) - pred_mask.save(pred_saved_path) - - progbar_pred.update(i + 1) - # return pred - - -def quick_predict( - model, - model_path, - transforms, - image_list, - image_dir=None, - save_dir="output", - aug_pred=False, - scales=1.0, - flip_horizontal=True, - flip_vertical=False, - is_slide=False, - stride=None, - crop_size=None, - custom_color=None, - skip_save=True, -): - """ - predict and visualize the image_list. - - Args: - model (nn.Layer): Used to predict for input image. - model_path (str): The path of pretrained model. - transforms (transform.Compose): Preprocess for input image. - image_list (list): A list of image path to be predicted. - image_dir (str, optional): The root directory of the images predicted. Default: None. - save_dir (str, optional): The directory to save the visualized results. Default: 'output'. - aug_pred (bool, optional): Whether to use mulit-scales and flip augment for predition. Default: False. - scales (list|float, optional): Scales for augment. It is valid when `aug_pred` is True. Default: 1.0. - flip_horizontal (bool, optional): Whether to use flip horizontally augment. It is valid when `aug_pred` is True. Default: True. - flip_vertical (bool, optional): Whether to use flip vertically augment. It is valid when `aug_pred` is True. Default: False. - is_slide (bool, optional): Whether to predict by sliding window. Default: False. - stride (tuple|list, optional): The stride of sliding window, the first is width and the second is height. - It should be provided when `is_slide` is True. - crop_size (tuple|list, optional): The crop size of sliding window, the first is width and the second is height. - It should be provided when `is_slide` is True. 
- custom_color (list, optional): Save images with a custom color map. Default: None, use paddleseg's default color map. - - """ - utils.utils.load_entire_model(model, model_path) - model.eval() - nranks = paddle.distributed.get_world_size() - local_rank = paddle.distributed.get_rank() - if nranks > 1: - img_lists = partition_list(image_list, nranks) - else: - img_lists = [image_list] - - if not skip_save: - added_saved_dir = os.path.join(save_dir, "added_prediction") - pred_saved_dir = os.path.join(save_dir, "pseudo_color_prediction") - - logger.info("Start to predict...") - progbar_pred = progbar.Progbar(target=len(img_lists[0]), verbose=1) - color_map = visualize.get_color_map_list(256, custom_color=custom_color) - with paddle.no_grad(): - for i, im_path in enumerate(img_lists[local_rank]): - data = preprocess(im_path, transforms) - - if aug_pred: - pred, _ = infer.aug_inference( - model, - data["img"], - trans_info=data["trans_info"], - scales=scales, - flip_horizontal=flip_horizontal, - flip_vertical=flip_vertical, - is_slide=is_slide, - stride=stride, - crop_size=crop_size, - ) - else: - pred, _ = infer.inference( - model, - data["img"], - trans_info=data["trans_info"], - is_slide=is_slide, - stride=stride, - crop_size=crop_size, - ) - pred = paddle.squeeze(pred) - pred = pred.numpy().astype("uint8") - - # get the saved name - if not skip_save: - if image_dir is not None: - im_file = im_path.replace(image_dir, "") - else: - im_file = os.path.basename(im_path) - if im_file[0] == "/" or im_file[0] == "\\": - im_file = im_file[1:] - - # save added image - if not skip_save: - added_image = utils.visualize.visualize(im_path, pred, color_map, weight=0.6) - added_image_path = os.path.join(added_saved_dir, im_file) - mkdir(added_image_path) - cv2.imwrite(added_image_path, added_image) - - # save pseudo color prediction - pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map) - if not skip_save: - pred_saved_path = os.path.join(pred_saved_dir, os.path.splitext(im_file)[0] + ".png") - mkdir(pred_saved_path) - pred_mask.save(pred_saved_path) - - progbar_pred.update(i + 1) - return pred, pred_mask diff --git a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/segmenter_vit_base_linear_ade20k_512x512_160k.yml b/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/segmenter_vit_base_linear_ade20k_512x512_160k.yml deleted file mode 100644 index d7108c6f32cf..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/segmenter_vit_base_linear_ade20k_512x512_160k.yml +++ /dev/null @@ -1,38 +0,0 @@ -_base_: '../_base_/ade20k.yml' - -batch_size: 2 -iters: 160000 - -model: - type: LinearSegmenter - backbone: - type: VisionTransformer - img_size: 512 - patch_size: 16 - embed_dim: 768 - depth: 12 - num_heads: 12 - mlp_ratio: 4 - qkv_bias: True - drop_rate: 0.0 - drop_path_rate: 0.1 - final_norm: True - pretrained: https://bj.bcebos.com/paddleseg/dygraph/pretrained_models/vit_base_patch16_384_augreg.tar.gz - -val_dataset: - transforms: - - type: ResizeByShort - short_size: 512 - - type: Normalize - -optimizer: - weight_decay: 0.0 - -lr_scheduler: - learning_rate: 0.001 - end_lr: 1.0e-05 - -test_config: - is_slide: True - crop_size: [512, 512] - stride: [512, 512] \ No newline at end of file diff --git a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/segmenter_vit_base_mask_ade20k_512x512_160k.yml b/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/segmenter_vit_base_mask_ade20k_512x512_160k.yml deleted file mode 100644 index 
36155108d4a9..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/segmenter_vit_base_mask_ade20k_512x512_160k.yml +++ /dev/null @@ -1,10 +0,0 @@ -_base_: './segmenter_vit_base_linear_ade20k_512x512_160k.yml' - -model: - type: MaskSegmenter - h_embed_dim: 768 - h_depth: 2 - h_num_heads: 12 - h_mlp_ratio: 4 - h_drop_rate: 0.0 - h_drop_path_rate: 0.1 \ No newline at end of file diff --git a/ppdiffusers/examples/controlnet/annotator/shuffle/__init__.py b/ppdiffusers/examples/controlnet/annotator/shuffle/__init__.py deleted file mode 100644 index 4e72d29a2d4e..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/shuffle/__init__.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import cv2 -import numpy as np -from annotator.util import img2mask, make_noise_disk - - -class ContentShuffleDetector: - def __call__(self, img, h=None, w=None, f=None): - H, W, C = img.shape - if h is None: - h = H - if w is None: - w = W - if f is None: - f = 256 - x = make_noise_disk(h, w, 1, f) * float(W - 1) - y = make_noise_disk(h, w, 1, f) * float(H - 1) - flow = np.concatenate([x, y], axis=2).astype(np.float32) - return cv2.remap(img, flow, None, cv2.INTER_LINEAR) - - -class ColorShuffleDetector: - def __call__(self, img): - H, W, C = img.shape - F = random.randint(64, 384) - A = make_noise_disk(H, W, 3, F) - B = make_noise_disk(H, W, 3, F) - C = (A + B) / 2.0 - A = (C + (A - C) * 3.0).clip(0, 1) - B = (C + (B - C) * 3.0).clip(0, 1) - L = img.astype(np.float32) / 255.0 - Y = A * L + B * (1 - L) - Y -= np.min(Y, axis=(0, 1), keepdims=True) - Y /= np.maximum(np.max(Y, axis=(0, 1), keepdims=True), 1e-5) - Y *= 255.0 - return Y.clip(0, 255).astype(np.uint8) - - -class GrayDetector: - def __call__(self, img): - eps = 1e-5 - X = img.astype(np.float32) - r, g, b = X[:, :, 0], X[:, :, 1], X[:, :, 2] - kr, kg, kb = [random.random() + eps for _ in range(3)] - ks = kr + kg + kb - kr /= ks - kg /= ks - kb /= ks - Y = r * kr + g * kg + b * kb - Y = np.stack([Y] * 3, axis=2) - return Y.clip(0, 255).astype(np.uint8) - - -class DownSampleDetector: - def __call__(self, img, level=3, k=16.0): - h = img.astype(np.float32) - for _ in range(level): - h += np.random.normal(loc=0.0, scale=k, size=h.shape) - h = cv2.pyrDown(h) - for _ in range(level): - h = cv2.pyrUp(h) - h += np.random.normal(loc=0.0, scale=k, size=h.shape) - return h.clip(0, 255).astype(np.uint8) - - -class Image2MaskShuffleDetector: - def __init__(self, resolution=(640, 512)): - self.H, self.W = resolution - - def __call__(self, img): - m = img2mask(img, self.H, self.W) - m *= 255.0 - return m.clip(0, 255).astype(np.uint8) diff --git a/ppdiffusers/examples/controlnet/annotator/util.py b/ppdiffusers/examples/controlnet/annotator/util.py deleted file mode 100644 index 6d3267a64e08..000000000000 --- a/ppdiffusers/examples/controlnet/annotator/util.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. 
All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import random - -import cv2 -import numpy as np - -annotator_ckpts_path = os.path.join(os.path.dirname(__file__), "ckpts") - - -def HWC3(x): - assert x.dtype == np.uint8 - if x.ndim == 2: - x = x[:, :, None] - assert x.ndim == 3 - H, W, C = x.shape - assert C == 1 or C == 3 or C == 4 - if C == 3: - return x - if C == 1: - return np.concatenate([x, x, x], axis=2) - if C == 4: - color = x[:, :, 0:3].astype(np.float32) - alpha = x[:, :, 3:4].astype(np.float32) / 255.0 - y = color * alpha + 255.0 * (1.0 - alpha) - y = y.clip(0, 255).astype(np.uint8) - return y - - -def resize_image(input_image, resolution): - H, W, C = input_image.shape - H = float(H) - W = float(W) - k = float(resolution) / min(H, W) - H *= k - W *= k - H = int(np.round(H / 64.0)) * 64 - W = int(np.round(W / 64.0)) * 64 - img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA) - return img - - -def make_noise_disk(H, W, C, F): - noise = np.random.uniform(low=0, high=1, size=((H // F) + 2, (W // F) + 2, C)) - noise = cv2.resize(noise, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC) - noise = noise[F : F + H, F : F + W] - noise -= np.min(noise) - noise /= np.max(noise) - if C == 1: - noise = noise[:, :, None] - return noise - - -def img2mask(img, H, W, low=10, high=90): - assert img.ndim == 3 or img.ndim == 2 - assert img.dtype == np.uint8 - - if img.ndim == 3: - y = img[:, :, random.randrange(0, img.shape[2])] - else: - y = img - - y = cv2.resize(y, (W, H), interpolation=cv2.INTER_CUBIC) - - if random.uniform(0, 1) < 0.5: - y = 255 - y - - return y < np.percentile(y, random.randrange(low, high)) diff --git a/ppdiffusers/examples/controlnet/control/__init__.py b/ppdiffusers/examples/controlnet/control/__init__.py deleted file mode 100644 index c3aaaafd3eb3..000000000000 --- a/ppdiffusers/examples/controlnet/control/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# flake8: noqa - -from .control_args import DataArguments, ModelArguments -from .control_trainer import ControlNetTrainer -from .dumpy_dataset import Fill50kDataset -from .model import ControlNet diff --git a/ppdiffusers/examples/controlnet/control/control_args.py b/ppdiffusers/examples/controlnet/control/control_args.py deleted file mode 100644 index 6a688687e1a2..000000000000 --- a/ppdiffusers/examples/controlnet/control/control_args.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass, field -from typing import Optional - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. - """ - - vae_name_or_path: Optional[str] = field(default=None, metadata={"help": "pretrained_vae_name_or_path"}) - text_encoder_name_or_path: Optional[str] = field(default=None, metadata={"help": "text_encoder_name_or_path"}) - unet_name_or_path: Optional[str] = field(default=None, metadata={"help": "unet_encoder_name_or_path"}) - tokenizer_name: Optional[str] = field( - default=None, - metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}, - ) - model_max_length: Optional[int] = field(default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) - num_inference_steps: Optional[int] = field(default=50, metadata={"help": "num_inference_steps"}) - use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"}) - pretrained_model_name_or_path: str = field( - default="runwayml/stable-diffusion-v1-5", - metadata={"help": "Path to pretrained model or model, when we want to resume training."}, - ) - image_logging_steps: Optional[int] = field(default=1000, metadata={"help": "Log image every X steps."}) - sd_locked: bool = field(default=True, metadata={"help": "lock unet output_blocks and out."}) - use_paddle_conv_init: bool = field(default=False, metadata={"help": "Whether or not use paddle conv2d init."}) - only_mid_control: bool = field(default=False, metadata={"help": "only_mid_control."}) - is_ldmbert: bool = field(default=False, metadata={"help": "Whether to use ldmbert."}) - enable_xformers_memory_efficient_attention: bool = field( - default=False, metadata={"help": "enable_xformers_memory_efficient_attention."} - ) - - -@dataclass -class DataArguments: - """ - Arguments pertaining to what data we are going to input our model for training. - """ - - resolution: int = field( - default=512, - metadata={ - "help": "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." 
- }, - ) - file_path: str = field(default="./fill50k", metadata={"help": "The path of the fill50k dataset."}) diff --git a/ppdiffusers/examples/controlnet/control/control_trainer.py b/ppdiffusers/examples/controlnet/control/control_trainer.py deleted file mode 100644 index 61be60e6f4a2..000000000000 --- a/ppdiffusers/examples/controlnet/control/control_trainer.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import contextlib -import os -import sys - -import paddle.amp.auto_cast as autocast - -from paddlenlp.trainer import Trainer -from paddlenlp.trainer.integrations import ( - INTEGRATION_TO_CALLBACK, - VisualDLCallback, - rewrite_logs, -) -from paddlenlp.utils.log import logger -from ppdiffusers.training_utils import unwrap_model - - -class VisualDLWithImageCallback(VisualDLCallback): - def autocast_smart_context_manager(self, args): - if args.fp16 or args.bf16: - amp_dtype = "float16" if args.fp16 else "bfloat16" - ctx_manager = autocast( - True, - custom_black_list=[ - "reduce_sum", - "c_softmax_with_cross_entropy", - ], - level=args.fp16_opt_level, - dtype=amp_dtype, - ) - else: - ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() - - return ctx_manager - - def on_step_end(self, args, state, control, model=None, **kwargs): - if hasattr(model, "on_train_batch_end"): - model.on_train_batch_end() - if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0: - control.should_log = True - - def on_log(self, args, state, control, logs=None, **kwargs): - if not state.is_world_process_zero: - return - # only log images on the main process - inputs = kwargs.get("inputs", None) - model = kwargs.get("model", None) - image_logs = {} - if ( - inputs is not None - and model is not None - and args.image_logging_steps > 0 - and state.global_step % args.image_logging_steps == 0 - ): - with self.autocast_smart_context_manager(args): - image_logs["reconstruction"] = model.decode_image(pixel_values=inputs["pixel_values"]) - image_logs["control"] = model.decode_control_image(controlnet_cond=inputs["controlnet_cond"]) - image_logs["ddim-samples-9.0"] = model.log_image( - input_ids=inputs["input_ids"], - controlnet_cond=inputs["controlnet_cond"], - guidance_scale=9.0, - height=args.resolution, - width=args.resolution, - ) - - if self.vdl_writer is None: - self._init_summary_writer(args) - - if self.vdl_writer is not None: - logs = rewrite_logs(logs) - for k, v in logs.items(): - if isinstance(v, (int, float)): - self.vdl_writer.add_scalar(k, v, state.global_step) - else: - logger.warning( - "Trainer is attempting to log a value of " - f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' - "This invocation of VisualDL's writer.add_scalar() " - "is incorrect so we dropped this attribute."
- ) - # log images - for k, v in image_logs.items(): - self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC") - self.vdl_writer.flush() - - -# register visualdl_with_image -INTEGRATION_TO_CALLBACK.update({"custom_visualdl": VisualDLWithImageCallback}) - - -class ControlNetTrainer(Trainer): - def compute_loss(self, model, inputs, return_outputs=False): - loss = model(**inputs) - return loss - - def _save(self, output_dir=None, state_dict=None, merge_tensor_parallel=False): - super()._save(output_dir=output_dir, state_dict=state_dict, merge_tensor_parallel=merge_tensor_parallel) - output_dir = output_dir if output_dir is not None else self.args.output_dir - unwrap_model(self.model).controlnet.save_pretrained(os.path.join(output_dir, "controlnet")) diff --git a/ppdiffusers/examples/controlnet/control/dumpy_dataset.py b/ppdiffusers/examples/controlnet/control/dumpy_dataset.py deleted file mode 100644 index c67eca10fb03..000000000000 --- a/ppdiffusers/examples/controlnet/control/dumpy_dataset.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os - -import cv2 -import numpy as np -import paddle -from paddle.io import Dataset - - -class Fill50kDataset(Dataset): - def __init__(self, tokenizer, file_path="./fill50k"): - self.data = [] - self.file_path = file_path - with open(os.path.join(file_path, "prompt.json"), "rt") as f: - for line in f: - self.data.append(json.loads(line)) - - self.text_processing = lambda caption: tokenizer( - caption, - padding="max_length", - truncation=True, - max_length=tokenizer.model_max_length, - return_tensors="np", - ).input_ids[0] - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - item = self.data[idx] - - source_filename = item["source"] - target_filename = item["target"] - prompt = item["prompt"] - - source = cv2.imread(os.path.join(self.file_path, source_filename)) - target = cv2.imread(os.path.join(self.file_path, target_filename)) - - # Do not forget that OpenCV read images in BGR order. - source = cv2.cvtColor(source, cv2.COLOR_BGR2RGB) - target = cv2.cvtColor(target, cv2.COLOR_BGR2RGB) - - # Normalize source images to [0, 1]. - source = source.astype(np.float32) / 255.0 - - # Normalize target images to [-1, 1]. 
- target = (target.astype(np.float32) / 127.5) - 1.0 - - input_ids = self.text_processing(prompt) - - return dict( - input_ids=paddle.to_tensor(input_ids, dtype=paddle.int64), - pixel_values=paddle.to_tensor(target.transpose([2, 0, 1]), dtype=paddle.float32), - controlnet_cond=paddle.to_tensor(source.transpose([2, 0, 1]), dtype=paddle.float32), - ) diff --git a/ppdiffusers/examples/controlnet/control/model.py b/ppdiffusers/examples/controlnet/control/model.py deleted file mode 100644 index 57f1b5df1f0a..000000000000 --- a/ppdiffusers/examples/controlnet/control/model.py +++ /dev/null @@ -1,289 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import contextlib -import inspect -import json -import os - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from paddlenlp.transformers import AutoTokenizer, CLIPTextModel -from paddlenlp.utils.log import logger -from ppdiffusers import ( - AutoencoderKL, - ControlNetModel, - DDIMScheduler, - DDPMScheduler, - LDMBertModel, - UNet2DConditionModel, - is_ppxformers_available, -) -from ppdiffusers.initializer import reset_initialized_parameter -from ppdiffusers.models.ema import LitEma -from ppdiffusers.training_utils import freeze_params - - -def read_json(file): - with open(file, "r", encoding="utf-8") as f: - data = json.load(f) - return data - - -class ControlNet(nn.Layer): - def __init__(self, model_args): - super().__init__() - # init tokenizer - tokenizer_name_or_path = ( - model_args.tokenizer_name - if model_args.pretrained_model_name_or_path is None - else os.path.join(model_args.pretrained_model_name_or_path, "tokenizer") - ) - self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name_or_path, model_max_length=model_args.model_max_length - ) - - vae_name = "vqvae" if model_args.is_ldmbert else "vae" - # init vae - vae_name_or_path = ( - model_args.vae_name_or_path - if model_args.pretrained_model_name_or_path is None - else os.path.join(model_args.pretrained_model_name_or_path, vae_name) - ) - - self.vae = AutoencoderKL.from_pretrained(vae_name_or_path) - freeze_params(self.vae.parameters()) - logger.info("Freeze vae parameters!") - - if model_args.is_ldmbert: - text_encoder_name_or_path = ( - model_args.text_encoder_name_or_path - if model_args.pretrained_model_name_or_path is None - else os.path.join(model_args.pretrained_model_name_or_path, "bert") - ) - # init text_encoder - self.text_encoder = LDMBertModel.from_pretrained(text_encoder_name_or_path) - else: - text_encoder_name_or_path = ( - model_args.text_encoder_name_or_path - if model_args.pretrained_model_name_or_path is None - else os.path.join(model_args.pretrained_model_name_or_path, "text_encoder") - ) - self.text_encoder = CLIPTextModel.from_pretrained(text_encoder_name_or_path) - - freeze_params(self.text_encoder.parameters()) - logger.info("Freeze text_encoder parameters!") - - unet_name_or_path = ( - model_args.unet_name_or_path - if model_args.pretrained_model_name_or_path is 
None - else os.path.join(model_args.pretrained_model_name_or_path, "unet") - ) - - self.unet = UNet2DConditionModel.from_pretrained(unet_name_or_path) - - freeze_params(self.unet.parameters()) - logger.info("Freeze unet parameters!") - - self.controlnet = ControlNetModel.from_unet(self.unet, load_weights_from_unet=True) - - if not model_args.use_paddle_conv_init: - # use torch conv2d init - reset_initialized_parameter(self.controlnet.controlnet_cond_embedding.conv_in) - reset_initialized_parameter(self.controlnet.controlnet_cond_embedding.blocks) - - self.noise_scheduler = DDPMScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000 - ) - self.eval_scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - steps_offset=1, - ) - self.eval_scheduler.set_timesteps(model_args.num_inference_steps) - self.use_ema = model_args.use_ema - if self.use_ema: - self.model_ema = LitEma(self.controlnet) - self.control_scales = [1.0] * 13 - self.only_mid_control = model_args.only_mid_control - - if model_args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): - try: - self.unet.enable_xformers_memory_efficient_attention() - self.controlnet.enable_xformers_memory_efficient_attention() - except Exception as e: - logger.warn( - "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}" - ) - - @contextlib.contextmanager - def ema_scope(self, context=None): - if self.use_ema: - self.model_ema.store(self.controlnet.parameters()) - self.model_ema.copy_to(self.controlnet) - if context is not None: - print(f"{context}: Switched to EMA weights") - try: - yield None - finally: - if self.use_ema: - self.model_ema.restore(self.controlnet.parameters()) - if context is not None: - print(f"{context}: Restored training weights") - - def on_train_batch_end(self): - if self.use_ema: - self.model_ema(self.controlnet) - - def forward(self, input_ids=None, pixel_values=None, controlnet_cond=None, **kwargs): - self.train() - with paddle.amp.auto_cast(enable=False): - with paddle.no_grad(): - self.vae.eval() - self.text_encoder.eval() - latents = self.vae.encode(pixel_values).latent_dist.sample() - latents = latents * 0.18215 - noise = paddle.randn(latents.shape) - timesteps = paddle.randint(0, self.noise_scheduler.num_train_timesteps, (latents.shape[0],)).astype( - "int64" - ) - noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps) - encoder_hidden_states = self.text_encoder(input_ids)[0] - # control - down_block_res_samples, mid_block_res_sample = self.controlnet( - noisy_latents, - timestep=timesteps, - encoder_hidden_states=encoder_hidden_states, - controlnet_cond=controlnet_cond, - conditioning_scale=self.control_scales, - return_dict=False, - ) - - # predict the noise residual - noise_pred = self.unet( - noisy_latents, - timestep=timesteps, - encoder_hidden_states=encoder_hidden_states, - down_block_additional_residuals=down_block_res_samples, - mid_block_additional_residual=mid_block_res_sample, - ).sample - loss = F.mse_loss(noise_pred, noise, reduction="mean") - return loss - - @paddle.no_grad() - def decode_image(self, pixel_values=None, **kwargs): - self.eval() - if pixel_values.shape[0] > 8: - pixel_values = pixel_values[:8] - latents = self.vae.encode(pixel_values).latent_dist.sample() - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 
1).transpose([0, 2, 3, 1]) - image = (image * 255.0).cast("float32").numpy().round() - return image - - @paddle.no_grad() - def decode_control_image(self, controlnet_cond=None, **kwargs): - return (255 * controlnet_cond.transpose([0, 2, 3, 1])).cast("float32").numpy().round() - - @paddle.no_grad() - def log_image( - self, input_ids=None, controlnet_cond=None, height=512, width=512, eta=0.0, guidance_scale=7.5, **kwargs - ): - self.eval() - with self.ema_scope(): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - # only log 8 image - if input_ids.shape[0] > 4: - input_ids = input_ids[:4] - - text_embeddings = self.text_encoder(input_ids)[0] - do_classifier_free_guidance = guidance_scale > 1.0 - if do_classifier_free_guidance: - batch_size, max_length = input_ids.shape - uncond_input = self.tokenizer( - [""] * batch_size, - padding="max_length", - truncation=True, - max_length=max_length, - return_tensors="pd", - ) - uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] - text_embeddings = paddle.concat([uncond_embeddings, text_embeddings], axis=0) - - latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, height // 8, width // 8)) - # ddim donot use this - latents = latents * self.eval_scheduler.init_noise_sigma - - accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - controlnet_cond_input = ( - paddle.concat([controlnet_cond] * 2) if do_classifier_free_guidance else controlnet_cond - ) - - for t in self.eval_scheduler.timesteps: - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - - # ddim donot use this - latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t) - - # ControlNet predict the noise residual - down_block_res_samples, mid_block_res_sample = self.controlnet( - latent_model_input, - t, - encoder_hidden_states=text_embeddings, - controlnet_cond=controlnet_cond_input, - conditioning_scale=self.control_scales, - return_dict=False, - ) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=text_embeddings, - down_block_additional_residuals=down_block_res_samples, - mid_block_additional_residual=mid_block_res_sample, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - latents = 1 / 0.18215 * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1).transpose([0, 2, 3, 1]) * 255.0 - - return image.cast("float32").numpy().round() - - def set_recompute(self, value=False): - def fn(layer): - if hasattr(layer, "gradient_checkpointing"): - layer.gradient_checkpointing = value - print("Set", layer.__class__, "recompute", layer.gradient_checkpointing) - - self.controlnet.apply(fn) diff --git a/ppdiffusers/examples/controlnet/extract_controlnet_ema_weights.py b/ppdiffusers/examples/controlnet/extract_controlnet_ema_weights.py deleted file mode 100644 index 17582dd93e64..000000000000 --- 
a/ppdiffusers/examples/controlnet/extract_controlnet_ema_weights.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -import paddle - - -def extract_controlnet_ema_weights(model_path, output_path): - state_dict = paddle.load(model_path, return_numpy=True) - ema_state_dict = {} - for k in state_dict.keys(): - if k.startswith("controlnet."): - flat_ema_key = "model_ema." + "".join(k.split(".")[1:]) - ema_state_dict[k.replace("controlnet.", "")] = state_dict.get(flat_ema_key) - if len(ema_state_dict) == 0: - raise ValueError("Can not extract ema weights!") - os.makedirs(output_path, exist_ok=True) - paddle.save(ema_state_dict, os.path.join(output_path, "model_state.ema.pdparams")) - print(f"Save EMA weights to {output_path} !") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_path", - type=str, - default="./model_state.pdparams", - help="model_state.", - ) - parser.add_argument( - "--output_path", - type=str, - default="ema_controlnet", - help="The model output path.", - ) - args = parser.parse_args() - extract_controlnet_ema_weights(args.model_path, args.output_path) diff --git a/ppdiffusers/examples/controlnet/gradio_canny2image.py b/ppdiffusers/examples/controlnet/gradio_canny2image.py deleted file mode 100644 index d8983196c0bd..000000000000 --- a/ppdiffusers/examples/controlnet/gradio_canny2image.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import random - -import gradio as gr -import paddle -from annotator.canny import CannyDetector -from annotator.util import HWC3, resize_image - -from paddlenlp.trainer import set_seed as seed_everything -from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline - -apply_canny = CannyDetector() - -controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") -pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None -) - - -def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - low_threshold, - high_threshold, -): - with paddle.no_grad(): - img = resize_image(HWC3(input_image), image_resolution) - H, W, C = img.shape - detected_map = apply_canny(img, low_threshold, high_threshold) - detected_map = HWC3(detected_map) - - control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 - control = control.unsqueeze(0).transpose([0, 3, 1, 2]) - - control_scales = ( - [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) - ) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - results = [] - for _ in range(num_samples): - img = pipe( - prompt + ", " + a_prompt, - negative_prompt=n_prompt, - image=control, - num_inference_steps=ddim_steps, - height=H, - width=W, - eta=eta, - controlnet_conditioning_scale=control_scales, - guidance_scale=scale, - ).images[0] - results.append(img) - - return [255 - detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Canny Edge Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source="upload", type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label="Guess Mode", value=False) - low_threshold = gr.Slider(label="Canny low threshold", minimum=1, maximum=255, value=100, step=1) - high_threshold = gr.Slider(label="Canny high threshold", minimum=1, maximum=255, value=200, step=1) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") - n_prompt = gr.Textbox( - label="Negative Prompt", - value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", - ) - with gr.Column(): - result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto" - ) - ips = [ - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - low_threshold, - high_threshold, - ] - 
run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name="0.0.0.0", server_port=8513) diff --git a/ppdiffusers/examples/controlnet/gradio_depth2image.py b/ppdiffusers/examples/controlnet/gradio_depth2image.py deleted file mode 100644 index d757e3183f93..000000000000 --- a/ppdiffusers/examples/controlnet/gradio_depth2image.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import cv2 -import gradio as gr -import paddle -from annotator.midas_paddle import MidasDetector_Infer as MidasDetector -from annotator.util import HWC3, resize_image - -from paddlenlp.trainer import set_seed as seed_everything -from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline - -apply_midas = MidasDetector() - -controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-depth") -pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None -) - - -def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, -): - with paddle.no_grad(): - input_image = HWC3(input_image) - detected_map, _ = apply_midas(resize_image(input_image, detect_resolution)) - detected_map = HWC3(detected_map) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - - control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 - control = control.unsqueeze(0).transpose([0, 3, 1, 2]) - - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - results = [] - for _ in range(num_samples): - img = pipe( - prompt + ", " + a_prompt, - negative_prompt=n_prompt, - image=control, - num_inference_steps=ddim_steps, - height=H, - width=W, - eta=eta, - controlnet_conditioning_scale=1.0, - guidance_scale=scale, - ).images[0] - results.append(img) - - return [detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Depth Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source="upload", type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label="Guess Mode", value=False) - detect_resolution = gr.Slider(label="Depth Resolution", minimum=128, maximum=1024, value=384, step=1) - ddim_steps = 
gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") - n_prompt = gr.Textbox( - label="Negative Prompt", - value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", - ) - with gr.Column(): - result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto" - ) - ips = [ - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - ] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name="0.0.0.0", server_port=8536) diff --git a/ppdiffusers/examples/controlnet/gradio_hed2image.py b/ppdiffusers/examples/controlnet/gradio_hed2image.py deleted file mode 100644 index 28bc98c7f975..000000000000 --- a/ppdiffusers/examples/controlnet/gradio_hed2image.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import cv2 -import gradio as gr -import paddle -from annotator.hed import HEDdetector -from annotator.util import HWC3, resize_image - -from paddlenlp.trainer import set_seed as seed_everything -from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline - -apply_hed = HEDdetector() - - -controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-hed") -pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None -) - - -def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, -): - with paddle.no_grad(): - input_image = HWC3(input_image) - detected_map = apply_hed(resize_image(input_image, detect_resolution)) - detected_map = HWC3(detected_map) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR) - - control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 - control = control.unsqueeze(0).transpose([0, 3, 1, 2]) - - control_scales = ( - [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) - ) # Magic number. IDK why. 
Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - results = [] - for _ in range(num_samples): - img = pipe( - prompt + ", " + a_prompt, - negative_prompt=n_prompt, - image=control, - num_inference_steps=ddim_steps, - height=H, - width=W, - eta=eta, - controlnet_conditioning_scale=control_scales, - guidance_scale=scale, - ).images[0] - results.append(img) - - return [detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with HED Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source="upload", type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label="Guess Mode", value=False) - detect_resolution = gr.Slider(label="HED Resolution", minimum=128, maximum=1024, value=512, step=1) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") - n_prompt = gr.Textbox( - label="Negative Prompt", - value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", - ) - with gr.Column(): - result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto" - ) - ips = [ - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - ] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name="0.0.0.0", server_port=8235) diff --git a/ppdiffusers/examples/controlnet/gradio_hough2image.py b/ppdiffusers/examples/controlnet/gradio_hough2image.py deleted file mode 100644 index 87e7ac0e10ad..000000000000 --- a/ppdiffusers/examples/controlnet/gradio_hough2image.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import random - -import cv2 -import gradio as gr -import paddle -from annotator.mlsd import MLSDdetector -from annotator.util import HWC3, resize_image - -from paddlenlp.trainer import set_seed as seed_everything -from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline - -apply_mlsd = MLSDdetector() - -controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-mlsd") -pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None -) - - -def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - value_threshold, - distance_threshold, -): - with paddle.no_grad(): - input_image = HWC3(input_image) - detected_map = apply_mlsd(resize_image(input_image, detect_resolution), value_threshold, distance_threshold) - detected_map = HWC3(detected_map) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - - control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 - control = control.unsqueeze(0).transpose([0, 3, 1, 2]) - - control_scales = ( - [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) - ) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - results = [] - for _ in range(num_samples): - img = pipe( - prompt + ", " + a_prompt, - negative_prompt=n_prompt, - image=control, - num_inference_steps=ddim_steps, - height=H, - width=W, - eta=eta, - controlnet_conditioning_scale=control_scales, - guidance_scale=scale, - ).images[0] - results.append(img) - - return [detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Hough Line Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source="upload", type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label="Guess Mode", value=False) - detect_resolution = gr.Slider( - label="Hough Line Resolution", minimum=128, maximum=1024, value=512, step=1 - ) - value_threshold = gr.Slider( - label="Hough value threshold (MLSD)", minimum=0.01, maximum=2.0, value=0.1, step=0.01 - ) - distance_threshold = gr.Slider( - label="Hough distance threshold (MLSD)", minimum=0.01, maximum=20.0, value=0.1, step=0.01 - ) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") - n_prompt = gr.Textbox( - label="Negative Prompt", - value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", - ) - with 
gr.Column(): - result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto" - ) - ips = [ - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - value_threshold, - distance_threshold, - ] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name="0.0.0.0") diff --git a/ppdiffusers/examples/controlnet/gradio_ip2p2image.py b/ppdiffusers/examples/controlnet/gradio_ip2p2image.py deleted file mode 100644 index d28c5269bb73..000000000000 --- a/ppdiffusers/examples/controlnet/gradio_ip2p2image.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import cv2 -import gradio as gr -import paddle -from annotator.util import HWC3, resize_image - -from paddlenlp.trainer import set_seed as seed_everything -from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline - -controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_ip2p") -pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None -) - - -def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, -): - with paddle.no_grad(): - img = resize_image(HWC3(input_image), image_resolution) - detected_map = input_image.copy() - H, W, C = img.shape - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR) - - control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 - control = control.unsqueeze(0).transpose([0, 3, 1, 2]) - - control_scales = ( - [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) - ) # Magic number. IDK why. 
Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - results = [] - for _ in range(num_samples): - img = pipe( - prompt + ", " + a_prompt, - negative_prompt=n_prompt, - image=control, - num_inference_steps=ddim_steps, - height=H, - width=W, - eta=eta, - controlnet_conditioning_scale=control_scales, - guidance_scale=scale, - ).images[0] - results.append(img) - - return [detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Instruct Pix2Pix") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source="upload", type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label="Guess Mode", value=False) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") - n_prompt = gr.Textbox( - label="Negative Prompt", - value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", - ) - with gr.Column(): - result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto" - ) - ips = [ - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - ] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - -block.launch(server_name="0.0.0.0", server_port=8513) diff --git a/ppdiffusers/examples/controlnet/gradio_normal2image.py b/ppdiffusers/examples/controlnet/gradio_normal2image.py deleted file mode 100644 index 66ee9050e822..000000000000 --- a/ppdiffusers/examples/controlnet/gradio_normal2image.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import random - -import cv2 -import gradio as gr -import paddle -from annotator.midas_paddle import MidasDetector_Infer as MidasDetector -from annotator.util import HWC3, resize_image - -from paddlenlp.trainer import set_seed as seed_everything -from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline - -apply_midas = MidasDetector() - -controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-normal") -pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None -) - - -def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - bg_threshold, -): - with paddle.no_grad(): - input_image = HWC3(input_image) - _, detected_map = apply_midas(resize_image(input_image, detect_resolution), bg_th=bg_threshold) - detected_map = HWC3(detected_map) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - - control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 - control = control.unsqueeze(0).transpose([0, 3, 1, 2]) - - control_scales = ( - [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) - ) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - results = [] - for _ in range(num_samples): - img = pipe( - prompt + ", " + a_prompt, - negative_prompt=n_prompt, - image=control, - num_inference_steps=ddim_steps, - height=H, - width=W, - eta=eta, - controlnet_conditioning_scale=control_scales, - guidance_scale=scale, - ).images[0] - results.append(img) - - return [detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Normal Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source="upload", type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label="Guess Mode", value=False) - detect_resolution = gr.Slider(label="Normal Resolution", minimum=128, maximum=1024, value=384, step=1) - bg_threshold = gr.Slider( - label="Normal background threshold", minimum=0.0, maximum=1.0, value=0.4, step=0.01 - ) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") - n_prompt = gr.Textbox( - label="Negative Prompt", - value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", - ) - with gr.Column(): - result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto" - ) - ips = [ - 
input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - bg_threshold, - ] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name="0.0.0.0") diff --git a/ppdiffusers/examples/controlnet/gradio_pose2image_openpose.py b/ppdiffusers/examples/controlnet/gradio_pose2image_openpose.py deleted file mode 100644 index 6877599eb7ae..000000000000 --- a/ppdiffusers/examples/controlnet/gradio_pose2image_openpose.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import cv2 -import gradio as gr -import paddle -from annotator.openpose import OpenposePaddleDetector -from annotator.util import HWC3, resize_image - -from paddlenlp.trainer import set_seed as seed_everything -from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline - -apply_openpose = OpenposePaddleDetector() - -controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose") -pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None -) - - -def process( - input_image, - hand, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, -): - with paddle.no_grad(): - input_image = HWC3(input_image) - detected_map, _ = apply_openpose(input_image, detect_resolution, hand) - detected_map = HWC3(detected_map) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - - control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 - control = control.unsqueeze(0).transpose([0, 3, 1, 2]) - - control_scales = ( - [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) - ) # Magic number. IDK why. 
Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - results = [] - for _ in range(num_samples): - img = pipe( - prompt + ", " + a_prompt, - negative_prompt=n_prompt, - image=control, - num_inference_steps=ddim_steps, - height=H, - width=W, - eta=eta, - controlnet_conditioning_scale=control_scales, - guidance_scale=scale, - ).images[0] - results.append(img) - - return [detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Human Pose") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source="upload", type="numpy") - hand = gr.Checkbox(label="detect hand", value=False) - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label="Guess Mode", value=False) - detect_resolution = gr.Slider( - label="OpenPose Resolution", minimum=128, maximum=1024, value=512, step=1 - ) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") - n_prompt = gr.Textbox( - label="Negative Prompt", - value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", - ) - with gr.Column(): - result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto" - ) - ips = [ - input_image, - hand, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - ] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name="0.0.0.0", server_port=8225) diff --git a/ppdiffusers/examples/controlnet/gradio_pose2image_ppdetpose.py b/ppdiffusers/examples/controlnet/gradio_pose2image_ppdetpose.py deleted file mode 100644 index 483d274fb374..000000000000 --- a/ppdiffusers/examples/controlnet/gradio_pose2image_ppdetpose.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import random - -import cv2 -import gradio as gr -import paddle -from annotator.ppdet_hrnet import PPDetDetector -from annotator.util import HWC3, resize_image - -from paddlenlp.trainer import set_seed as seed_everything -from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline - -apply_ppdetpose = PPDetDetector() - -controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose") -pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None -) - - -def process( - input_image, - hand, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, -): - with paddle.no_grad(): - input_image = HWC3(input_image) - detected_map, _ = apply_ppdetpose(input_image, detect_resolution, hand) - detected_map = HWC3(detected_map) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - - control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 - control = control.unsqueeze(0).transpose([0, 3, 1, 2]) - - control_scales = ( - [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) - ) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - results = [] - for _ in range(num_samples): - img = pipe( - prompt + ", " + a_prompt, - negative_prompt=n_prompt, - image=control, - num_inference_steps=ddim_steps, - height=H, - width=W, - eta=eta, - controlnet_conditioning_scale=control_scales, - guidance_scale=scale, - ).images[0] - results.append(img) - - return [detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Human Pose") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source="upload", type="numpy") - hand = gr.Checkbox(label="detect hand", value=False) - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label="Guess Mode", value=False) - detect_resolution = gr.Slider( - label="OpenPose Resolution", minimum=128, maximum=1024, value=512, step=1 - ) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") - n_prompt = gr.Textbox( - label="Negative Prompt", - value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", - ) - with gr.Column(): - result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto" - ) - ips = [ - input_image, - hand, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - 
ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - ] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name="0.0.0.0", server_port=8232) diff --git a/ppdiffusers/examples/controlnet/gradio_seg2image_segformer.py b/ppdiffusers/examples/controlnet/gradio_seg2image_segformer.py deleted file mode 100644 index 5bf8a7905f1d..000000000000 --- a/ppdiffusers/examples/controlnet/gradio_seg2image_segformer.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import cv2 -import gradio as gr -import paddle -from annotator.segformer_paddle import SegformerDetector -from annotator.util import HWC3, resize_image - -from paddlenlp.trainer import set_seed as seed_everything -from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline - -apply_uniformer = SegformerDetector(mode="ade20k") - -controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg") -pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None -) - - -def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, -): - with paddle.no_grad(): - input_image = HWC3(input_image) - detected_map = apply_uniformer(resize_image(input_image, detect_resolution)) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - - control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 - control = control.unsqueeze(0).transpose([0, 3, 1, 2]) - - control_scales = ( - [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) - ) # Magic number. IDK why. 
Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - results = [] - for _ in range(num_samples): - img = pipe( - prompt + ", " + a_prompt, - negative_prompt=n_prompt, - image=control, - num_inference_steps=ddim_steps, - height=H, - width=W, - eta=eta, - controlnet_conditioning_scale=control_scales, - guidance_scale=scale, - ).images[0] - results.append(img) - - return [detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Segmentation Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source="upload", type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label="Guess Mode", value=False) - detect_resolution = gr.Slider( - label="Segmentation Resolution", minimum=128, maximum=1024, value=512, step=1 - ) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") - n_prompt = gr.Textbox( - label="Negative Prompt", - value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", - ) - with gr.Column(): - result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto" - ) - ips = [ - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - ] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name="0.0.0.0", server_port=8222) diff --git a/ppdiffusers/examples/controlnet/gradio_seg2image_segmenter.py b/ppdiffusers/examples/controlnet/gradio_seg2image_segmenter.py deleted file mode 100644 index 0eee845159f0..000000000000 --- a/ppdiffusers/examples/controlnet/gradio_seg2image_segmenter.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import random - -import cv2 -import gradio as gr -import paddle -from annotator.segmenter_paddle import SegmenterDetector -from annotator.util import HWC3, resize_image - -from paddlenlp.trainer import set_seed as seed_everything -from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline - -apply_uniformer = SegmenterDetector() - -controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg") -pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None -) - - -def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, -): - with paddle.no_grad(): - input_image = HWC3(input_image) - detected_map = apply_uniformer(resize_image(input_image, detect_resolution)) - img = resize_image(input_image, image_resolution) - H, W, C = img.shape - - detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - - control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 - control = control.unsqueeze(0).transpose([0, 3, 1, 2]) - - control_scales = ( - [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) - ) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - results = [] - for _ in range(num_samples): - img = pipe( - prompt + ", " + a_prompt, - negative_prompt=n_prompt, - image=control, - num_inference_steps=ddim_steps, - height=H, - width=W, - eta=eta, - controlnet_conditioning_scale=control_scales, - guidance_scale=scale, - ).images[0] - results.append(img) - - return [detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Segmentation Maps") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source="upload", type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label="Guess Mode", value=False) - detect_resolution = gr.Slider( - label="Segmentation Resolution", minimum=128, maximum=1024, value=512, step=1 - ) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") - n_prompt = gr.Textbox( - label="Negative Prompt", - value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", - ) - with gr.Column(): - result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto" - ) - ips = [ - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - ] - 
run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name="0.0.0.0", server_port=8322) diff --git a/ppdiffusers/examples/controlnet/gradio_shuffle2image.py b/ppdiffusers/examples/controlnet/gradio_shuffle2image.py deleted file mode 100644 index 461487e263bd..000000000000 --- a/ppdiffusers/examples/controlnet/gradio_shuffle2image.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random - -import gradio as gr -import paddle -from annotator.shuffle import ContentShuffleDetector -from annotator.util import HWC3, resize_image - -from paddlenlp.trainer import set_seed as seed_everything -from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline - -apply_shuffle = ContentShuffleDetector() - -controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_shuffle") -pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None -) - - -def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, -): - with paddle.no_grad(): - img = resize_image(HWC3(input_image), image_resolution) - H, W, C = img.shape - detected_map = apply_shuffle(img, w=W, h=H, f=256) - - control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 - control = control.unsqueeze(0).transpose([0, 3, 1, 2]) - - control_scales = [strength] * 13 - if seed == -1: - seed = random.randint(0, 65535) - seed_everything(seed) - results = [] - for _ in range(num_samples): - img = pipe( - prompt + ", " + a_prompt, - negative_prompt=n_prompt, - image=control, - num_inference_steps=ddim_steps, - height=H, - width=W, - eta=eta, - controlnet_conditioning_scale=control_scales, - guidance_scale=scale, - ).images[0] - results.append(img) - - return [detected_map] + results - - -block = gr.Blocks().queue() -with block: - with gr.Row(): - gr.Markdown("## Control Stable Diffusion with Content Shuffle") - with gr.Row(): - with gr.Column(): - input_image = gr.Image(source="upload", type="numpy") - prompt = gr.Textbox(label="Prompt") - run_button = gr.Button(label="Run") - with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) - image_resolution = gr.Slider(label="Image Resolution", minimum=256, maximum=768, value=512, step=64) - strength = gr.Slider(label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01) - guess_mode = gr.Checkbox(label="Guess Mode", value=False) - ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) - scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1) - seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) - eta = 
gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") - n_prompt = gr.Textbox( - label="Negative Prompt", - value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", - ) - with gr.Column(): - result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto" - ) - ips = [ - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - ] - run_button.click(fn=process, inputs=ips, outputs=[result_gallery]) - - -block.launch(server_name="0.0.0.0", server_port=8513) diff --git a/ppdiffusers/examples/controlnet/requirements.txt b/ppdiffusers/examples/controlnet/requirements.txt deleted file mode 100644 index 895a47d19f22..000000000000 --- a/ppdiffusers/examples/controlnet/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -paddlehub>=2.3.1 -paddleseg==2.7.0 -paddlenlp>=2.6.0rc0 -opencv-python -ppdiffusers>=0.16.1 -cchardet diff --git a/ppdiffusers/examples/controlnet/train_txt2img_control_trainer.py b/ppdiffusers/examples/controlnet/train_txt2img_control_trainer.py deleted file mode 100644 index 3c43edeab909..000000000000 --- a/ppdiffusers/examples/controlnet/train_txt2img_control_trainer.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import itertools -import math -import os - -import paddle -from control import ( - ControlNet, - ControlNetTrainer, - DataArguments, - Fill50kDataset, - ModelArguments, -) - -from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint -from paddlenlp.utils.log import logger - - -def unfreeze_params(params): - for param in params: - param.stop_gradient = False - - -def main(): - parser = PdArgumentParser((ModelArguments, DataArguments, TrainingArguments)) - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # report to custom_visualdl - training_args.report_to = ["custom_visualdl"] - training_args.resolution = data_args.resolution - training_args.image_logging_steps = model_args.image_logging_steps = ( - math.ceil(model_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps - ) - training_args.print_config(model_args, "Model") - training_args.print_config(data_args, "Data") - - paddle.set_device(training_args.device) - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." 
- ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - model = ControlNet(model_args) - train_dataset = Fill50kDataset(model.tokenizer, data_args.file_path) - - trainer = ControlNetTrainer( - model=model, args=training_args, train_dataset=train_dataset, tokenizer=model.tokenizer - ) - # must set recompute after trainer init - trainer.model.set_recompute(training_args.recompute) - - if not model_args.sd_locked: - params_to_train = itertools.chain( - trainer.model.controlnet.parameters(), - trainer.model.unet.up_blocks.parameters(), - trainer.model.unet.conv_norm_out.parameters(), - trainer.model.unet.conv_out.parameters(), - ) - unfreeze_params(params_to_train) - else: - params_to_train = trainer.model.controlnet.parameters() - trainer.set_optimizer_grouped_parameters(params_to_train) - - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - - # Training - trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() - trainer.save_state() - - -if __name__ == "__main__": - main() diff --git a/ppdiffusers/examples/dreambooth/README.md b/ppdiffusers/examples/dreambooth/README.md deleted file mode 100644 index 04ddb0e17324..000000000000 --- a/ppdiffusers/examples/dreambooth/README.md +++ /dev/null @@ -1,282 +0,0 @@ -# DreamBooth训练代码 - -[DreamBooth: Fine Tuning Text-to-Image Diffusion Models for Subject-Driven Generation](https://arxiv.org/abs/2208.12242)是一种新的文本生成图像(text2image)的“个性化”(可适应用户特定的图像生成需求)扩散模型。虽然 DreamBooth 是在 Imagen 的基础上做的调整,但研究人员在论文中还提到,他们的方法也适用于其他扩散模型。只需几张(通常 3~5 张)指定物体的照片和相应的类名(如“狗”)作为输入,并添加一个唯一标识符植入不同的文字描述中,DreamBooth 就能让被指定物体“完美”出现在用户想要生成的场景中。 - -
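To try this out, only a small folder of instance photos is needed. As a convenience, the sketch below pulls the public example dog pictures into `./dream_image`; the `diffusers/dog-example` dataset id and the use of `huggingface_hub` here are assumptions for illustration, and any local folder with 3-5 photos of your subject works just as well.

```python
# Sketch: download a handful of example instance images into ./dream_image.
# Assumes huggingface_hub is installed and the "diffusers/dog-example"
# dataset repo is reachable; a local folder of 3-5 photos works equally well.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="diffusers/dog-example",
    repo_type="dataset",
    local_dir="./dream_image",
)
```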
- -## 1 本地运行 -### 1.1 安装依赖 - -在运行这个训练代码前,我们需要安装下面的训练依赖。 - -```bash -pip install -U ppdiffusers visualdl -``` - -### 1.2 Sks Dog 训练教程 - -为了下载`CompVis/stable-diffusion-v1-4`模型权重,我们需要阅读并签署相关的License。在这里我们默认用户已经阅读并签署了解了相关License,有关License及模型的详细介绍,请访问[CompVis/stable-diffusion-v1-4 card](https://huggingface.co/CompVis/stable-diffusion-v1-4)。 - -> License: The CreativeML OpenRAIL M license is an Open RAIL M license, adapted from the work that BigScience and the RAIL Initiative are jointly carrying in the area of responsible AI licensing. See also the article about the BLOOM Open RAIL license on which our license is based. -
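Before committing to a full training run, it can help to confirm that the base weights actually download and load in the current environment. A minimal check, assuming the license above has been accepted and there is enough disk space, might look like:

```python
# Sketch: make sure the Stable Diffusion v1-4 weights can be fetched and loaded
# before starting a long training job (assumes the license has been accepted).
from ppdiffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
print(type(pipe.unet), type(pipe.vae), type(pipe.text_encoder))
```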
- -#### 1.2.1 硬件要求 -当我们开启`gradient_checkpointing`功能后(Tips:该功能可以在一定程度上减少显存消耗),我们可以在24GB显存的GPU上微调模型。如果想要使用更大的`batch_size`进行更快的训练,建议用户使用具有30GB+显存的显卡。 - -#### 1.2.2 单机单卡训练 -```bash -export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="./dream_image" -export OUTPUT_DIR="./dream_outputs" - -python -u train_dreambooth.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --instance_data_dir=$INSTANCE_DIR \ - --output_dir=$OUTPUT_DIR \ - --instance_prompt="a photo of sks dog" \ - --resolution=512 \ - --train_batch_size=1 \ - --gradient_accumulation_steps=1 \ - --learning_rate=5e-6 \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --max_train_steps=400 -``` - -`train_dreambooth.py`代码可传入的参数解释如下: -> 主要修改的参数 -> * `--pretrained_model_name_or_path`: 所使用的 `Stable Diffusion` 模型权重名称或者本地下载的模型路径,目前支持了上表中的8种模型权重,我们可直接替换使用。 -> * `--instance_data_dir`: 实例(物体)图片文件夹地址。 -> * `--instance_prompt`: 带有特定实例(物体)的提示词描述文本,例如`a photo of sks dog`,其中dog代表实例(物体)。 -> * `--class_data_dir`: 类别(class)图片文件夹地址,主要作为先验知识。 -> * `--class_prompt`: 类别(class)提示词文本,该提示器要与实例(物体)是同一种类别,例如`a photo of dog`,主要作为先验知识。 -> * `--num_class_images`: 事先需要从`class_prompt`中生成多少张图片,主要作为先验知识。 -> * `--prior_loss_weight`: 先验`loss`占比权重。 -> * `--sample_batch_size`: 生成`class_prompt`文本对应的图片所用的批次(batch size),注意,当GPU显卡显存较小的时候需要将这个默认值改成1。 -> * `--with_prior_preservation`: 是否将生成的同类图片(先验知识)一同加入训练,当为`True`的时候,`class_prompt`、`class_data_dir`、`num_class_images`、`sample_batch_size`和`prior_loss_weight`才生效。 -> * `--num_train_epochs`: 训练的轮数,默认值为`1`。 -> * `--max_train_steps`: 最大的训练步数,当我们设置这个值后,它会重新计算所需的`num_train_epochs`轮数。 -> * `--checkpointing_steps`: 每间隔多少步`(global step步数)`,保存模型权重。 -> * `--gradient_accumulation_steps`: 梯度累积的步数,用户可以指定梯度累积的步数,在梯度累积的 step 中。减少多卡之间梯度的通信,减少更新的次数,扩大训练的 batch_size 。 -> * `--train_text_encoder`: 是否一同训练文本编码器的部分,默认为`False`。 - -> 可以修改的参数 -> * `--height`: 输入给模型的图片`高度`,由于用户输入的并不是固定大小的图片,因此代码中会将原始大小的图片压缩成指定`高度`的图片,默认值为`None`。 -> * `--width`: 输入给模型的图片`宽度`,由于用户输入的并不是固定大小的图片,因此代码中会将原始大小的图片压缩成指定`宽度`的图片,默认值为`None`。 -> * `--resolution`: 输入给模型图片的`分辨率`,当`高度`或`宽度`为`None`时,我们将会使用`resolution`,默认值为`512`。 -> * `--learning_rate`: 学习率。 -> * `--scale_lr`: 是否根据GPU数量,梯度累积步数,以及批量数对学习率进行缩放。缩放公式:`learning_rate * gradient_accumulation_steps * train_batch_size * num_processes`。 -> * `--lr_scheduler`: 要使用的学习率调度策略。默认为 `constant`。 -> * `--lr_warmup_steps`: 用于从 0 到 `learning_rate` 的线性 `warmup` 的步数。 -> * `--train_batch_size`: 训练时每张显卡所使用的`batch_size批量`,当我们的显存较小的时候,需要将这个值设置的小一点。 -> * `--center_crop`: 在调整图片宽和高之前是否将裁剪图像居中,默认值为`False`。 -> * `--random_flip`: 是否对图片进行随机水平反转,默认值为`False`。 -> * `--gradient_checkpointing`: 是否开启`gradient_checkpointing`功能,在一定程度上能够更显显存,但是会减慢训练速度。 -> * `--output_dir`: 模型训练完所保存的路径,默认设置为`dreambooth-model`文件夹,建议用户每训练一个模型可以修改一下输出路径,防止先前已有的模型被覆盖了。 -> * `--enable_xformers_memory_efficient_attention`: 是否开启`xformers`,开启后训练速度会变慢,但是能够节省显存。注意我们需要安装develop版本的paddlepaddle! 
- -> 基本无需修改的参数 -> * `--seed`: 随机种子,为了可以复现训练结果,Tips:当前paddle设置该随机种子后仍无法完美复现。 -> * `--adam_beta1`: `AdamW` 优化器时的 `beta1` 超参数。默认为 `0.9`。 -> * `--adam_beta2`: `AdamW` 优化器时的 `beta2` 超参数。默认为 `0.999`。 -> * `--adam_weight_decay`: `AdamW` 优化器时的 `weight_decay` 超参数。 默认为`0.02`。 -> * `--adam_weight_decay`: `AdamW` 优化器时的 `epsilon` 超参数。默认为 `1e-8`。 -> * `--max_grad_norm`: 最大梯度范数(用于梯度裁剪)。默认为 `-1` 表示不使用。 -> * `--logging_dir`: Tensorboard 或 VisualDL 记录日志的地址,注意:该地址会与输出目录进行拼接,即,最终的日志地址为`/`。 -> * `--report_to`: 用于记录日志的工具,可选`["tensorboard", "visualdl"]`,默认为`visualdl`,如果选用`tensorboard`,请使用命令安装`pip install tensorboardX`。 -> * `--push_to_hub`: 是否将模型上传到 `huggingface hub`,默认值为 `False`。 -> * `--hub_token`: 上传到 `huggingface hub` 所需要使用的 `token`,如果我们已经登录了,那么我们就无需填写。 -> * `--hub_model_id`: 上传到 `huggingface hub` 的模型库名称, 如果为 `None` 的话表示我们将使用 `output_dir` 的名称作为模型库名称。 - - -#### 1.2.3 单机多卡训练 -通过设置`--gpus`,我们可以指定 GPU 为 `0,1,2,3` 卡。 - -```bash -export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="./dream_image" -export OUTPUT_DIR="./dream_outputs" - -python -u -m paddle.distributed.launch --gpus "0,1,2,3" train_dreambooth.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --instance_data_dir=$INSTANCE_DIR \ - --output_dir=$OUTPUT_DIR \ - --instance_prompt="a photo of sks dog" \ - --resolution=512 \ - --train_batch_size=1 \ - --gradient_accumulation_steps=1 \ - --learning_rate=5e-6 \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --max_train_steps=400 -``` - -#### 1.2.4 预测生成图片 - -当训练完成后,模型将自动保存到`output_dir`目录,在上述例子中,我们的模型最终保存到了`dream_outputs`文件夹。我们可以使用`StableDiffusionPipeline`快速加载该模型。 - -``` -├── train_dreambooth.py # 训练脚本 -├── dream_outputs # 我们指定的输出文件路径 - ├── vae # vae权重文件夹 - ├── model_state.pdparams - ├── config.json - ├── text_encoder # text_encoder权重文件夹 - ├── config.json - ├── model_state.pdparams - ├── unet # unet权重文件夹 - ├── model_state.pdparams - ├── config.json - ├── scheduler # scheduler文件夹 - ├── scheduler_config.json - ├── feature_extractor # feature_extractor文件夹 - ├── preprocessor_config.json - ├── tokenizer # tokenizer文件夹 - ├── tokenizer_config.json - ├── merges.txt - ├── special_tokens_map.json - ├── added_tokens.json - ├── vocab.json -``` - -```python -from ppdiffusers import StableDiffusionPipeline - -# 我们所需加载的模型地址,这里我们输入了训练时候使用的 output_dir 地址 -model_path = "./dream_outputs" -pipe = StableDiffusionPipeline.from_pretrained(model_path) - -prompt = "A photo of sks dog in a bucket" -image = pipe(prompt).images[0] -# 保存图片,我们可以查看 yoda-pokemon.png 图片。 -image.save("sks-dog.png") -``` -
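The snippet above draws a single sample. In practice it is common to generate several candidates from the fine-tuned weights and keep the best one; the following sketch does that, with an illustrative seed, step count and guidance scale rather than values prescribed by this example.

```python
# Sketch: sample a few candidate images from the fine-tuned DreamBooth weights.
import paddle
from ppdiffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("./dream_outputs")
paddle.seed(42)  # fix the global seed so the run is repeatable

prompt = "A photo of sks dog in a bucket"
for i in range(4):
    image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
    image.save(f"sks-dog-{i}.png")
```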
- -### 给模型引入先验知识(图片)一同训练 -`with_prior_preservation`这个参数主要用于防止模型过拟合以及语言出现语义理解偏差(如,原始防止模型将`狗`误理解成了其他一种`动物`)。请参阅论文以了解更多信息。对于该种训练方式,我们首先使用带有类别提示的模型生成对应的图像,然后在训练期间将这些图像与我们自己准备的数据一起使用。 -根据论文,建议生成 num_epochs * num_samples 张图像,其中 200-300 适用于绝大多数情况,因此当我们不太确定的时候,可以设置成200或300。 - -#### 单机训练 -```bash -export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="./dream_image" -export CLASS_DIR="./dream_class_image" -export OUTPUT_DIR="./dream_outputs_with_class" - -python -u train_dreambooth.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --instance_data_dir=$INSTANCE_DIR \ - --class_data_dir=$CLASS_DIR \ - --output_dir=$OUTPUT_DIR \ - --with_prior_preservation --prior_loss_weight=1.0 \ - --instance_prompt="a photo of sks dog" \ - --class_prompt="a photo of dog" \ - --resolution=512 \ - --train_batch_size=1 \ - --gradient_accumulation_steps=1 \ - --learning_rate=5e-6 \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --num_class_images=200 \ - --max_train_steps=800 -``` -#### 预测 -```python -from ppdiffusers import StableDiffusionPipeline - -# 我们所需加载的模型地址,这里我们输入了训练时候使用的 output_dir 地址 -model_path = "./dream_outputs_with_class" -pipe = StableDiffusionPipeline.from_pretrained(model_path) - -prompt = "A photo of sks dog in a bucket" -image = pipe(prompt).images[0] -# 保存图片,我们可以查看 yoda-pokemon.png 图片。 -image.save("sks-dog-with-class.png") -``` -
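For reference, prior preservation usually enters the objective as a weighted sum of two denoising losses: one on the instance samples and one on the class (prior) samples, which are concatenated in the same batch. The sketch below shows that pattern; the function and variable names are illustrative and this is not a verbatim excerpt of the training script.

```python
# Sketch: how --prior_loss_weight typically combines the two loss terms when
# --with_prior_preservation is enabled. Instance and class samples are
# concatenated in one batch, so the prediction is split in half first.
import paddle
import paddle.nn.functional as F


def dreambooth_loss(model_pred, target, prior_loss_weight=1.0):
    pred_instance, pred_prior = paddle.chunk(model_pred, 2, axis=0)
    target_instance, target_prior = paddle.chunk(target, 2, axis=0)
    instance_loss = F.mse_loss(pred_instance, target_instance)
    prior_loss = F.mse_loss(pred_prior, target_prior)
    return instance_loss + prior_loss_weight * prior_loss
```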
- - -# 使用 LoRA 和 DreamBooth 技术进行模型训练 - -[LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) 是微软研究员引入的一项新技术,主要用于处理大模型微调的问题。目前超过数十亿以上参数的具有强能力的大模型 (例如 GPT-3) 通常在为了适应其下游任务的微调中会呈现出巨大开销。LoRA 建议冻结预训练模型的权重并在每个 Transformer 块中注入可训练层 (秩-分解矩阵)。因为不需要为大多数模型权重计算梯度,所以大大减少了需要训练参数的数量并且降低了 GPU 的内存要求。研究人员发现,通过聚焦大模型的 Transformer 注意力块,使用 LoRA 进行的微调质量与全模型微调相当,同时速度更快且需要更少的计算。 - -简而言之,LoRA允许通过向现有权重添加一对秩分解矩阵,并只训练这些新添加的权重来适应预训练的模型。这有几个优点: - -- 保持预训练的权重不变,这样模型就不容易出现灾难性遗忘 [catastrophic forgetting](https://www.pnas.org/doi/10.1073/pnas.1611835114); -- 秩分解矩阵的参数比原始模型少得多,这意味着训练的 LoRA 权重很容易移植; -- LoRA 注意力层允许通过一个 `scale` 参数来控制模型适应新训练图像的程度。 - -[cloneofsimo](https://github.com/cloneofsimo) 是第一个在 [LoRA GitHub](https://github.com/cloneofsimo/lora) 仓库中尝试使用 LoRA 训练 Stable Diffusion 的人。 - -## 训练 - -**___Note: 如果我们使用 [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 进行训练,那么我们需要将 `resolution` 改成 768 .___** - -```bash -export MODEL_NAME="runwayml/stable-diffusion-v1-5" -export INSTANCE_DIR="path-to-instance-images" -export OUTPUT_DIR="path-to-save-model" - -python train_dreambooth_lora.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --instance_data_dir=$INSTANCE_DIR \ - --output_dir=$OUTPUT_DIR \ - --instance_prompt="a photo of sks dog" \ - --resolution=512 \ - --train_batch_size=1 \ - --gradient_accumulation_steps=1 \ - --checkpointing_steps=100 \ - --learning_rate=1e-4 \ - --report_to="visualdl" \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --max_train_steps=500 \ - --validation_prompt="A photo of sks dog in a bucket" \ - --validation_epochs=50 \ - --lora_rank=4 \ - --seed=0 -``` - -**___Note: 当我使用 LoRA 训练模型的时候,我们需要使用更大的学习率,因此我们这里使用 *1e-4* 而不是 *2e-6*.___** - -最终经过微调后的 LoRA 权重,我们已经上传到了 [junnyu/lora_dreambooth_dog_example](https://huggingface.co/junnyu/lora_dreambooth_dog_example). **___Note: [最终的权重](https://huggingface.co/junnyu/lora_dreambooth_dog_example/blob/main/paddle_lora_weights.pdparams) 只有 3 MB 的大小.___** - -## 推理 - -经过训练, LoRA 权重可以直接加载到原始的 pipeline 中。 - -首先我们需要加载原始的 pipeline: - -```python -from ppdiffusers import DiffusionPipeline, DPMSolverMultistepScheduler -import paddle - -pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") -pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) -``` - -接下来, 我们需要使用 `load_attn_procs` 方法将 `adapter layers` 添加到 UNet 模型中。 -```python -pipe.unet.load_attn_procs("junnyu/lora_dreambooth_dog_example", from_hf_hub=True) -``` - -最终, 我们可以使用模型进行推理预测. - -```python -image = pipe("A picture of a sks dog in a bucket", num_inference_steps=25).images[0] -image.save("demo.png") -``` -
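Conceptually, each LoRA-adapted projection keeps the frozen pretrained weight and adds a trainable low-rank update scaled by a factor. The sketch below illustrates the idea with a standalone layer; it is not the actual ppdiffusers attention-processor implementation, where the same decomposition is wired into the UNet attention blocks.

```python
# Sketch of the core LoRA idea: a frozen base linear layer plus a trainable
# rank-r correction lora_up(lora_down(x)) scaled by `scale`.
import paddle.nn as nn


class LoRALinear(nn.Layer):
    def __init__(self, in_features, out_features, rank=4, scale=1.0):
        super().__init__()
        self.base = nn.Linear(in_features, out_features)
        for p in self.base.parameters():
            p.stop_gradient = True  # freeze the (pretrained) base weight
        self.lora_down = nn.Linear(in_features, rank, bias_attr=False)
        self.lora_up = nn.Linear(rank, out_features, bias_attr=False)
        self.scale = scale

    def forward(self, x):
        return self.base(x) + self.scale * self.lora_up(self.lora_down(x))
```

Because only `lora_down` and `lora_up` receive gradients, the number of trainable parameters stays tiny, which is why the exported LoRA weights mentioned above are only a few megabytes.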
- -# 参考资料 -- https://github.com/huggingface/diffusers/tree/main/examples/dreambooth -- https://github.com/CompVis/stable-diffusion diff --git a/ppdiffusers/examples/dreambooth/requirements.txt b/ppdiffusers/examples/dreambooth/requirements.txt deleted file mode 100644 index d77a600a0daf..000000000000 --- a/ppdiffusers/examples/dreambooth/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -paddlenlp>=2.6.0rc0 -Pillow -ppdiffusers>=0.16.1 \ No newline at end of file diff --git a/ppdiffusers/examples/dreambooth/train_dreambooth.py b/ppdiffusers/examples/dreambooth/train_dreambooth.py deleted file mode 100644 index e57242b3df68..000000000000 --- a/ppdiffusers/examples/dreambooth/train_dreambooth.py +++ /dev/null @@ -1,858 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import contextlib -import gc -import hashlib -import math -import os -import sys -import warnings -from pathlib import Path -from typing import Optional - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from huggingface_hub import HfFolder, Repository, create_repo, whoami -from paddle.distributed.fleet.utils.hybrid_parallel_util import ( - fused_allreduce_gradients, -) -from paddle.io import BatchSampler, DataLoader, Dataset, DistributedBatchSampler -from paddle.optimizer import AdamW -from paddle.vision import BaseTransform, transforms -from PIL import Image -from tqdm.auto import tqdm - -from paddlenlp.trainer import set_seed -from paddlenlp.transformers import AutoTokenizer, PretrainedConfig -from paddlenlp.utils.log import logger -from ppdiffusers import ( - AutoencoderKL, - DDPMScheduler, - DiffusionPipeline, - UNet2DConditionModel, - is_ppxformers_available, -) -from ppdiffusers.models.modeling_utils import freeze_params, unwrap_model -from ppdiffusers.optimization import get_scheduler -from ppdiffusers.utils import check_min_version - -# Will error if the minimal version of ppdiffusers is not installed. Remove at your own risks. 
-check_min_version("0.16.1") - - -def url_or_path_join(*path_list): - return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list) - - -class Lambda(BaseTransform): - def __init__(self, fn, keys=None): - super().__init__(keys) - self.fn = fn - - def _apply_image(self, img): - return self.fn(img) - - -def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str): - try: - text_encoder_config = PretrainedConfig.from_pretrained( - url_or_path_join(pretrained_model_name_or_path, "text_encoder") - ) - model_class = text_encoder_config.architectures[0] - except Exception: - model_class = "LDMBertModel" - if model_class == "CLIPTextModel": - from paddlenlp.transformers import CLIPTextModel - - return CLIPTextModel - elif model_class == "RobertaSeriesModelWithTransformation": - from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesModelWithTransformation, - ) - - return RobertaSeriesModelWithTransformation - elif model_class == "BertModel": - from paddlenlp.transformers import BertModel - - return BertModel - elif model_class == "LDMBertModel": - from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import ( - LDMBertModel, - ) - - return LDMBertModel - else: - raise ValueError(f"{model_class} is not supported.") - - -def set_recompute(model, value=False): - def fn(layer): - # ldmbert - if hasattr(layer, "enable_recompute"): - layer.enable_recompute = value - print("Set", layer.__class__, "recompute", layer.enable_recompute) - # unet - if hasattr(layer, "gradient_checkpointing"): - layer.gradient_checkpointing = value - print("Set", layer.__class__, "recompute", layer.gradient_checkpointing) - - model.apply(fn) - - -def get_report_to(args): - if args.report_to == "visualdl": - from visualdl import LogWriter - - writer = LogWriter(logdir=args.logging_dir) - elif args.report_to == "tensorboard": - from tensorboardX import SummaryWriter - - writer = SummaryWriter(logdir=args.logging_dir) - else: - raise ValueError("report_to must be in ['visualdl', 'tensorboard']") - return writer - - -def parse_args(input_args=None): - parser = argparse.ArgumentParser(description="Simple example of a training dreambooth script.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default=None, - required=True, - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--tokenizer_name", - type=str, - default=None, - help="Pretrained tokenizer name or path if not the same as model_name", - ) - parser.add_argument( - "--instance_data_dir", - type=str, - default=None, - required=True, - help="A folder containing the training data of instance images.", - ) - parser.add_argument( - "--class_data_dir", - type=str, - default=None, - required=False, - help="A folder containing the training data of class images.", - ) - parser.add_argument( - "--instance_prompt", - type=str, - default=None, - required=True, - help="The prompt with identifier specifying the instance", - ) - parser.add_argument( - "--class_prompt", - type=str, - default=None, - help="The prompt to specify images in the same class as provided instance images.", - ) - parser.add_argument( - "--with_prior_preservation", - default=False, - action="store_true", - help="Flag to add prior preservation loss.", - ) - parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.") - parser.add_argument( - "--num_class_images", - 
type=int, - default=100, - help=( - "Minimal class images for prior preservation loss. If there are not enough images already present in" - " class_data_dir, additional images will be sampled with class_prompt." - ), - ) - parser.add_argument( - "--output_dir", - type=str, - default="./dreambooth-model", - help="The output directory where the model predictions and checkpoints will be written.", - ) - parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") - parser.add_argument( - "--height", - type=int, - default=None, - help=( - "The height for input images, all the images in the train/validation dataset will be resized to this" - " height" - ), - ) - parser.add_argument( - "--width", - type=int, - default=None, - help=( - "The width for input images, all the images in the train/validation dataset will be resized to this" - " width" - ), - ) - parser.add_argument( - "--resolution", - type=int, - default=512, - help=( - "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution" - ), - ) - parser.add_argument( - "--center_crop", - default=False, - action="store_true", - help=( - "Whether to center crop the input images to the resolution. If not set, the images will be randomly" - " cropped. The images will be resized to the resolution first before cropping." - ), - ) - parser.add_argument( - "--random_flip", - action="store_true", - help="whether to randomly flip images horizontally", - ) - parser.add_argument( - "--train_text_encoder", - action="store_true", - help="Whether to train the text encoder. If set, the text encoder should be float32 precision.", - ) - parser.add_argument( - "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." - ) - parser.add_argument( - "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images." - ) - parser.add_argument("--num_train_epochs", type=int, default=1) - parser.add_argument( - "--max_train_steps", - type=int, - default=None, - help="Total number of training steps to perform. If provided, overrides num_train_epochs.", - ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument( - "--gradient_checkpointing", - action="store_true", - help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=5e-6, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument( - "--scale_lr", - action="store_true", - default=False, - help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", - ) - parser.add_argument( - "--lr_scheduler", - type=str, - default="constant", - help=( - 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]' - ), - ) - parser.add_argument( - "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." 
- ) - parser.add_argument( - "--lr_num_cycles", - type=int, - default=1, - help="Number of hard resets of the lr in cosine_with_restarts scheduler.", - ) - parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") - parser.add_argument( - "--dataloader_num_workers", - type=int, - default=0, - help=( - "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." - ), - ) - parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") - parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") - parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") - parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") - parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") - parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") - parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") - parser.add_argument( - "--hub_model_id", - type=str, - default=None, - help="The name of the repository to keep in sync with the local `output_dir`.", - ) - parser.add_argument( - "--logging_dir", - type=str, - default="logs", - help=( - "[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. Will default to" - "*output_dir/logs" - ), - ) - parser.add_argument( - "--report_to", - type=str, - default="visualdl", - choices=["tensorboard", "visualdl"], - help="Log writer type.", - ) - parser.add_argument( - "--checkpointing_steps", - type=int, - default=500, - help=("Save a checkpoint of the training state every X updates."), - ) - parser.add_argument( - "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." - ) - parser.add_argument("--noise_offset", type=float, default=1.0, help="The scale of noise offset.") - - if input_args is not None: - args = parser.parse_args(input_args) - else: - args = parser.parse_args() - - if args.instance_data_dir is None: - raise ValueError("You must specify a train data directory.") - - if args.with_prior_preservation: - if args.class_data_dir is None: - raise ValueError("You must specify a data directory for class images.") - if args.class_prompt is None: - raise ValueError("You must specify prompt for class images.") - else: - # logger is not available yet - if args.class_data_dir is not None: - warnings.warn("You need not use --class_data_dir without --with_prior_preservation.") - if args.class_prompt is not None: - warnings.warn("You need not use --class_prompt without --with_prior_preservation.") - - args.logging_dir = os.path.join(args.output_dir, args.logging_dir) - if args.height is None or args.width is None and args.resolution is not None: - args.height = args.width = args.resolution - - return args - - -class DreamBoothDataset(Dataset): - """ - A dataset to prepare the instance and class images with the prompts for fine-tuning the model. - It pre-processes the images and the tokenizes prompts. 
- """ - - def __init__( - self, - instance_data_root, - instance_prompt, - tokenizer, - class_data_root=None, - class_prompt=None, - class_num=None, - height=512, - width=512, - center_crop=False, - interpolation="bilinear", - random_flip=False, - ): - self.height = height - self.width = width - self.center_crop = center_crop - self.tokenizer = tokenizer - - self.instance_data_root = Path(instance_data_root) - if not self.instance_data_root.exists(): - raise ValueError("Instance images root doesn't exists.") - ext = ["png", "jpg", "jpeg", "bmp", "PNG", "JPG", "JPEG", "BMP"] - self.instance_images_path = [] - for p in Path(instance_data_root).iterdir(): - if any(suffix in p.name for suffix in ext): - self.instance_images_path.append(p) - self.num_instance_images = len(self.instance_images_path) - self.instance_prompt = instance_prompt - self._length = self.num_instance_images - - if class_data_root is not None: - self.class_data_root = Path(class_data_root) - self.class_data_root.mkdir(parents=True, exist_ok=True) - self.class_images_path = [] - for p in Path(class_data_root).iterdir(): - if any(suffix in p.name for suffix in ext): - self.class_images_path.append(p) - if class_num is not None: - self.num_class_images = min(len(self.class_images_path), class_num) - else: - self.num_class_images = len(self.class_images_path) - self._length = max(self.num_class_images, self.num_instance_images) - self.class_prompt = class_prompt - else: - self.class_data_root = None - - self.image_transforms = transforms.Compose( - [ - transforms.Resize((height, width), interpolation=interpolation), - transforms.CenterCrop((height, width)) if center_crop else transforms.RandomCrop((height, width)), - transforms.RandomHorizontalFlip() if random_flip else Lambda(lambda x: x), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ] - ) - - def __len__(self): - return self._length - - def __getitem__(self, index): - example = {} - instance_image = Image.open(self.instance_images_path[index % self.num_instance_images]) - if not instance_image.mode == "RGB": - instance_image = instance_image.convert("RGB") - example["instance_images"] = self.image_transforms(instance_image) - example["instance_prompt_ids"] = self.tokenizer( - self.instance_prompt, - padding="do_not_pad", - truncation=True, - max_length=self.tokenizer.model_max_length, - return_attention_mask=False, - ).input_ids - - if self.class_data_root: - class_image = Image.open(self.class_images_path[index % self.num_class_images]) - if not class_image.mode == "RGB": - class_image = class_image.convert("RGB") - example["class_images"] = self.image_transforms(class_image) - example["class_prompt_ids"] = self.tokenizer( - self.class_prompt, - padding="do_not_pad", - truncation=True, - max_length=self.tokenizer.model_max_length, - return_attention_mask=False, - ).input_ids - - return example - - -class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." 
- - def __init__(self, prompt, num_samples): - self.prompt = prompt - self.num_samples = num_samples - - def __len__(self): - return self.num_samples - - def __getitem__(self, index): - example = {} - example["prompt"] = self.prompt - example["index"] = index - return example - - -def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): - if token is None: - token = HfFolder.get_token() - if organization is None: - username = whoami(token)["name"] - return f"{username}/{model_id}" - else: - return f"{organization}/{model_id}" - - -def main(): - args = parse_args() - rank = paddle.distributed.get_rank() - is_main_process = rank == 0 - num_processes = paddle.distributed.get_world_size() - if num_processes > 1: - paddle.distributed.init_parallel_env() - - # If passed along, set the training seed now. - if args.seed is not None: - set_seed(args.seed) - - # Generate class images if prior preservation is enabled. - if args.with_prior_preservation: - class_images_dir = Path(args.class_data_dir) - if not class_images_dir.exists(): - class_images_dir.mkdir(parents=True) - cur_class_images = len(list(class_images_dir.iterdir())) - - if cur_class_images < args.num_class_images: - pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, safety_checker=None, requires_safety_checker=False - ) - if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): - try: - pipeline.unet.enable_xformers_memory_efficient_attention() - except Exception as e: - logger.warn( - "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}" - ) - pipeline.set_progress_bar_config(disable=True) - - num_new_images = args.num_class_images - cur_class_images - logger.info(f"Number of class images to sample: {num_new_images}.") - - sample_dataset = PromptDataset(args.class_prompt, num_new_images) - batch_sampler = ( - DistributedBatchSampler(sample_dataset, batch_size=args.sample_batch_size, shuffle=False) - if num_processes > 1 - else BatchSampler(sample_dataset, batch_size=args.sample_batch_size, shuffle=False) - ) - sample_dataloader = DataLoader( - sample_dataset, batch_sampler=batch_sampler, num_workers=args.dataloader_num_workers - ) - - for example in tqdm(sample_dataloader, desc="Generating class images", disable=not is_main_process): - images = pipeline(example["prompt"]).images - - for i, image in enumerate(images): - hash_image = hashlib.sha1(image.tobytes()).hexdigest() - image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" - image.save(image_filename) - pipeline.to("cpu") - del pipeline - gc.collect() - - if is_main_process: - if args.output_dir is not None: - os.makedirs(args.output_dir, exist_ok=True) - - if args.push_to_hub: - if args.hub_model_id is None: - repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) - else: - repo_name = args.hub_model_id - - create_repo(repo_name, exist_ok=True, token=args.hub_token) - repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) - - with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: - if "step_*" not in gitignore: - gitignore.write("step_*\n") - if "epoch_*" not in gitignore: - gitignore.write("epoch_*\n") - - # Load the tokenizer - if args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) - elif args.pretrained_model_name_or_path: - tokenizer = 
AutoTokenizer.from_pretrained(url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) - - # import correct text encoder class - text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path) - - # Load scheduler and models - noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") - text_encoder = text_encoder_cls.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "text_encoder") - ) - text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict() - if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]: - use_attention_mask = True - else: - use_attention_mask = False - vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae") - unet = UNet2DConditionModel.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="unet", - ) - - freeze_params(vae.parameters()) - if not args.train_text_encoder: - freeze_params(text_encoder.parameters()) - if args.gradient_checkpointing: - unet.enable_gradient_checkpointing() - if args.train_text_encoder: - set_recompute(text_encoder, True) - - if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): - try: - unet.enable_xformers_memory_efficient_attention() - except Exception as e: - logger.warn( - "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}" - ) - - # Dataset and DataLoaders creation: - train_dataset = DreamBoothDataset( - instance_data_root=args.instance_data_dir, - instance_prompt=args.instance_prompt, - class_data_root=args.class_data_dir if args.with_prior_preservation else None, - class_prompt=args.class_prompt, - class_num=args.num_class_images, - tokenizer=tokenizer, - height=args.height, - width=args.width, - center_crop=args.center_crop, - interpolation="bilinear", - random_flip=args.random_flip, - ) - - def collate_fn(examples): - input_ids = [example["instance_prompt_ids"] for example in examples] - pixel_values = [example["instance_images"] for example in examples] - - # Concat class and instance examples for prior preservation. - # We do this to avoid doing two forward passes. - if args.with_prior_preservation: - input_ids += [example["class_prompt_ids"] for example in examples] - pixel_values += [example["class_images"] for example in examples] - - pixel_values = paddle.stack(pixel_values).astype("float32") - - input_ids = tokenizer.pad( - {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pd" - ).input_ids - - return { - "input_ids": input_ids, - "pixel_values": pixel_values, - } - - train_sampler = ( - DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) - if num_processes > 1 - else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) - ) - train_dataloader = DataLoader( - train_dataset, batch_sampler=train_sampler, collate_fn=collate_fn, num_workers=args.dataloader_num_workers - ) - - # Scheduler and math around the number of training steps. 
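As a quick sanity check on the step arithmetic that follows, the same calculation can be run with made-up numbers; the dataloader length, accumulation factor, and epoch count below are illustrative, not values from this script.

```python
import math

# Illustrative values only.
num_batches_per_epoch = 200        # stands in for len(train_dataloader)
gradient_accumulation_steps = 4
num_train_epochs = 2

num_update_steps_per_epoch = math.ceil(num_batches_per_epoch / gradient_accumulation_steps)  # 50
max_train_steps = num_train_epochs * num_update_steps_per_epoch                              # 100 optimizer updates in total
print(num_update_steps_per_epoch, max_train_steps)
```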
- num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - if args.max_train_steps is None: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - - if num_processes > 1: - unet = paddle.DataParallel(unet) - if args.train_text_encoder: - text_encoder = paddle.DataParallel(text_encoder) - - params_to_optimize = ( - list(unet.parameters()) + list(text_encoder.parameters()) if args.train_text_encoder else unet.parameters() - ) - - if args.scale_lr: - args.learning_rate = ( - args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes - ) - - lr_scheduler = get_scheduler( - args.lr_scheduler, - learning_rate=args.learning_rate, - num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, - num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, - num_cycles=args.lr_num_cycles, - power=args.lr_power, - ) - # Initialize the optimizer - optimizer = AdamW( - learning_rate=lr_scheduler, - parameters=params_to_optimize, - beta1=args.adam_beta1, - beta2=args.adam_beta2, - weight_decay=args.adam_weight_decay, - epsilon=args.adam_epsilon, - grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None, - ) - - if is_main_process: - logger.info("----------- Configuration Arguments -----------") - for arg, value in sorted(vars(args).items()): - logger.info("%s: %s" % (arg, value)) - logger.info("------------------------------------------------") - writer = get_report_to(args) - - # Train! - total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps - - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num batches each epoch = {len(train_dataloader)}") - logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") - logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {args.max_train_steps}") - - # Only show the progress bar once on each machine. 
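The total train batch size logged above, and the optional --scale_lr adjustment, reduce to the following arithmetic; all values here are illustrative placeholders rather than defaults taken from this script.

```python
train_batch_size = 2               # per-device batch size (illustrative)
num_processes = 4                  # number of training processes / GPUs (illustrative)
gradient_accumulation_steps = 4
learning_rate = 5e-6               # illustrative base learning rate

total_batch_size = train_batch_size * num_processes * gradient_accumulation_steps  # 32
# With --scale_lr the base rate is multiplied by the same three factors:
scaled_learning_rate = learning_rate * gradient_accumulation_steps * train_batch_size * num_processes  # 1.6e-04
```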
- progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process) - progress_bar.set_description("Train Steps") - global_step = 0 - - # Keep vae in eval model as we don't train these - vae.eval() - if args.train_text_encoder: - text_encoder.train() - else: - text_encoder.eval() - unet.train() - - for epoch in range(args.num_train_epochs): - for step, batch in enumerate(train_dataloader): - # Convert images to latent space - latents = vae.encode(batch["pixel_values"]).latent_dist.sample() - latents = latents * vae.config.scaling_factor - - # Sample noise that we'll add to the latents - noise = paddle.randn(latents.shape, dtype=latents.dtype) - if args.noise_offset: - # https://www.crosslabs.org//blog/diffusion-with-offset-noise - noise += args.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype - ) - batch_size = latents.shape[0] - # Sample a random timestep for each image - timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,), dtype="int64") - - # Add noise to the latents according to the noise magnitude at each timestep - # (this is the forward diffusion process) - noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - - if num_processes > 1 and ( - args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0) - ): - # grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0: - # gradient_checkpointing, no_sync every where - # gradient_checkpointing + grad_acc, no_sync every where - unet_ctx_manager = unet.no_sync() - if args.train_text_encoder: - text_encoder_ctx_manager = text_encoder.no_sync() - else: - text_encoder_ctx_manager = ( - contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() - ) - else: - unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() - text_encoder_ctx_manager = ( - contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() - ) - - with text_encoder_ctx_manager: - # Get the text embedding for conditioning - if use_attention_mask: - attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64") - else: - attention_mask = None - encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0] - - with unet_ctx_manager: - # Predict the noise residual / sample - model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample - - # Get the target for loss depending on the prediction type - if noise_scheduler.config.prediction_type == "epsilon": - target = noise - elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, timesteps) - else: - raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") - - if args.with_prior_preservation: - # Chunk the noise and model_pred into two parts and compute the loss on each part separately. - model_pred, model_pred_prior = model_pred.chunk(2, axis=0) - target, target_prior = target.chunk(2, axis=0) - - # Compute instance loss - loss = F.mse_loss(model_pred, target, reduction="mean") - - # Compute prior loss - prior_loss = F.mse_loss(model_pred_prior, target_prior, reduction="mean") - - # Add the prior loss to the instance loss. 
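# The class half of the batch comes from the images generated earlier with --class_prompt;
# weighting its loss by --prior_loss_weight keeps the fine-tuned model close to its original
# prior for that class, which is the prior-preservation term described in the DreamBooth paper.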
- loss = loss + args.prior_loss_weight * prior_loss - else: - loss = F.mse_loss(model_pred, target, reduction="mean") - - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - loss.backward() - - if (step + 1) % args.gradient_accumulation_steps == 0: - if num_processes > 1 and args.gradient_checkpointing: - fused_allreduce_gradients(params_to_optimize, None) - optimizer.step() - lr_scheduler.step() - optimizer.clear_grad() - progress_bar.update(1) - global_step += 1 - step_loss = loss.item() * args.gradient_accumulation_steps - logs = { - "epoch": str(epoch).zfill(4), - "step_loss": round(step_loss, 10), - "lr": lr_scheduler.get_lr(), - } - progress_bar.set_postfix(**logs) - - if is_main_process: - for name, val in logs.items(): - if name == "epoch": - continue - writer.add_scalar(f"train/{name}", val, global_step) - - if global_step % args.checkpointing_steps == 0: - save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - unwrap_model(unet).save_pretrained(os.path.join(save_path, "unet")) - if args.train_text_encoder: - unwrap_model(text_encoder).save_pretrained(os.path.join(save_path, "text_encoder")) - - if global_step >= args.max_train_steps: - break - - # Create the pipeline using the trained modules and save it. - if is_main_process: - writer.close() - # Create the pipeline using using the trained modules and save it. - pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - unet=unwrap_model(unet), - text_encoder=unwrap_model(text_encoder), - ) - pipeline.save_pretrained(args.output_dir) - - if args.push_to_hub: - repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) - - -if __name__ == "__main__": - main() diff --git a/ppdiffusers/examples/dreambooth/train_dreambooth_lora.py b/ppdiffusers/examples/dreambooth/train_dreambooth_lora.py deleted file mode 100644 index 015522da7732..000000000000 --- a/ppdiffusers/examples/dreambooth/train_dreambooth_lora.py +++ /dev/null @@ -1,1070 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
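Once the full-model DreamBooth script above has written its pipeline with save_pretrained, the result can be loaded back for generation. A minimal sketch follows; the output directory and prompt are placeholders rather than values taken from this patch.

```python
from ppdiffusers import DiffusionPipeline

# "./dreambooth-model" stands in for whatever --output_dir was used during training.
pipe = DiffusionPipeline.from_pretrained("./dreambooth-model")
image = pipe("a photo of sks dog in a bucket", num_inference_steps=50).images[0]
image.save("dreambooth-sample.png")
```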
- -import argparse -import contextlib -import gc -import hashlib -import math -import os -import sys -import time -import warnings -from pathlib import Path -from typing import Optional, Type - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -import requests -from huggingface_hub import HfFolder, create_repo, upload_folder, whoami -from paddle.distributed.fleet.utils.hybrid_parallel_util import ( - fused_allreduce_gradients, -) -from paddle.io import BatchSampler, DataLoader, Dataset, DistributedBatchSampler -from paddle.optimizer import AdamW -from paddle.vision import BaseTransform, transforms -from PIL import Image -from tqdm.auto import tqdm - -from paddlenlp.trainer import set_seed -from paddlenlp.transformers import AutoTokenizer, PretrainedConfig -from paddlenlp.utils.log import logger -from ppdiffusers import ( - AutoencoderKL, - DDPMScheduler, - DiffusionPipeline, - DPMSolverMultistepScheduler, - UNet2DConditionModel, - is_ppxformers_available, -) -from ppdiffusers.loaders import AttnProcsLayers, LoraLoaderMixin -from ppdiffusers.models.attention_processor import ( - AttnProcessor, - AttnProcessor2_5, - LoRAAttnProcessor, - LoRAAttnProcessor2_5, -) -from ppdiffusers.optimization import get_scheduler -from ppdiffusers.training_utils import freeze_params, unwrap_model -from ppdiffusers.utils import TEXT_ENCODER_ATTN_MODULE, check_min_version - -# Will error if the minimal version of ppdiffusers is not installed. Remove at your own risks. -check_min_version("0.16.1") - -# Since HF sometimes timeout, we need to retry uploads -# Credit: https://github.com/huggingface/datasets/blob/06ae3f678651bfbb3ca7dd3274ee2f38e0e0237e/src/datasets/utils/file_utils.py#L265 - - -def _retry( - func, - func_args: Optional[tuple] = None, - func_kwargs: Optional[dict] = None, - exceptions: Type[requests.exceptions.RequestException] = requests.exceptions.RequestException, - max_retries: int = 0, - base_wait_time: float = 0.5, - max_wait_time: float = 2, -): - func_args = func_args or () - func_kwargs = func_kwargs or {} - retry = 0 - while True: - try: - return func(*func_args, **func_kwargs) - except exceptions as err: - if retry >= max_retries: - raise err - else: - sleep_time = min(max_wait_time, base_wait_time * 2**retry) # Exponential backoff - logger.info(f"{func} timed out, retrying in {sleep_time}s... [{retry/max_retries}]") - time.sleep(sleep_time) - retry += 1 - - -def url_or_path_join(*path_list): - return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list) - - -def save_model_card(repo_id: str, images=None, base_model=str, train_text_encoder=False, prompt=str, repo_folder=None): - img_str = "" - for i, image in enumerate(images): - image.save(os.path.join(repo_folder, f"image_{i}.png")) - img_str += f"![img_{i}](./image_{i}.png)\n" - - yaml = f""" ---- -license: creativeml-openrail-m -base_model: {base_model} -instance_prompt: {prompt} -tags: -- stable-diffusion -- stable-diffusion-ppdiffusers -- text-to-image -- ppdiffusers -- lora -inference: false ---- - """ - model_card = f""" -# LoRA DreamBooth - {repo_id} -These are LoRA adaption weights for {base_model}. The weights were trained on {prompt} using [DreamBooth](https://dreambooth.github.io/). You can find some example images in the following. \n -{img_str} - -LoRA for the text encoder was enabled: {train_text_encoder}. 
-""" - with open(os.path.join(repo_folder, "README.md"), "w") as f: - f.write(yaml + model_card) - - -def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str): - try: - text_encoder_config = PretrainedConfig.from_pretrained( - url_or_path_join(pretrained_model_name_or_path, "text_encoder") - ) - model_class = text_encoder_config.architectures[0] - except Exception: - model_class = "LDMBertModel" - if model_class == "CLIPTextModel": - from paddlenlp.transformers import CLIPTextModel - - return CLIPTextModel - elif model_class == "RobertaSeriesModelWithTransformation": - from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesModelWithTransformation, - ) - - return RobertaSeriesModelWithTransformation - elif model_class == "BertModel": - from paddlenlp.transformers import BertModel - - return BertModel - elif model_class == "LDMBertModel": - from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import ( - LDMBertModel, - ) - - return LDMBertModel - else: - raise ValueError(f"{model_class} is not supported.") - - -class Lambda(BaseTransform): - def __init__(self, fn, keys=None): - super().__init__(keys) - self.fn = fn - - def _apply_image(self, img): - return self.fn(img) - - -def get_report_to(args): - if args.report_to == "visualdl": - from visualdl import LogWriter - - writer = LogWriter(logdir=args.logging_dir) - elif args.report_to == "tensorboard": - from tensorboardX import SummaryWriter - - writer = SummaryWriter(logdir=args.logging_dir) - else: - raise ValueError("report_to must be in ['visualdl', 'tensorboard']") - return writer - - -def parse_args(input_args=None): - parser = argparse.ArgumentParser(description="Simple example of a training dreambooth lora script.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default=None, - required=True, - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--tokenizer_name", - type=str, - default=None, - help="Pretrained tokenizer name or path if not the same as model_name", - ) - parser.add_argument( - "--instance_data_dir", - type=str, - default=None, - required=True, - help="A folder containing the training data of instance images.", - ) - parser.add_argument( - "--class_data_dir", - type=str, - default=None, - required=False, - help="A folder containing the training data of class images.", - ) - parser.add_argument( - "--instance_prompt", - type=str, - default=None, - required=True, - help="The prompt with identifier specifying the instance", - ) - parser.add_argument( - "--class_prompt", - type=str, - default=None, - help="The prompt to specify images in the same class as provided instance images.", - ) - parser.add_argument( - "--validation_prompt", type=str, default=None, help="A prompt that is sampled during training for inference." - ) - parser.add_argument( - "--num_validation_images", - type=int, - default=4, - help="Number of images that should be generated during validation with `validation_prompt`.", - ) - parser.add_argument( - "--validation_epochs", - type=int, - default=50, - help=( - "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt" - " `args.validation_prompt` multiple times: `args.num_validation_images`." 
- ), - ) - parser.add_argument( - "--with_prior_preservation", - default=False, - action="store_true", - help="Flag to add prior preservation loss.", - ) - parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.") - parser.add_argument( - "--num_class_images", - type=int, - default=100, - help=( - "Minimal class images for prior preservation loss. If there are not enough images already present in" - " class_data_dir, additional images will be sampled with class_prompt." - ), - ) - parser.add_argument( - "--output_dir", - type=str, - default="lora-dreambooth-model", - help="The output directory where the model predictions and checkpoints will be written.", - ) - parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") - parser.add_argument( - "--height", - type=int, - default=None, - help=( - "The height for input images, all the images in the train/validation dataset will be resized to this" - " height" - ), - ) - parser.add_argument( - "--width", - type=int, - default=None, - help=( - "The width for input images, all the images in the train/validation dataset will be resized to this" - " width" - ), - ) - parser.add_argument( - "--resolution", - type=int, - default=512, - help=( - "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution" - ), - ) - parser.add_argument( - "--lora_rank", - type=int, - default=4, - help="The rank of lora linear.", - ) - parser.add_argument( - "--center_crop", - default=False, - action="store_true", - help=( - "Whether to center crop the input images to the resolution. If not set, the images will be randomly" - " cropped. The images will be resized to the resolution first before cropping." - ), - ) - parser.add_argument( - "--random_flip", - action="store_true", - help="whether to randomly flip images horizontally", - ) - parser.add_argument( - "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." - ) - parser.add_argument( - "--train_text_encoder", - action="store_true", - help="Whether to train the text encoder. If set, the text encoder should be float32 precision.", - ) - parser.add_argument( - "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images." - ) - parser.add_argument("--num_train_epochs", type=int, default=1) - parser.add_argument( - "--max_train_steps", - type=int, - default=None, - help="Total number of training steps to perform. 
If provided, overrides num_train_epochs.", - ) - parser.add_argument( - "--checkpointing_steps", - type=int, - default=500, - help=("Save a checkpoint of the training state every X updates."), - ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument( - "--gradient_checkpointing", - action="store_true", - help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=5e-4, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument( - "--scale_lr", - action="store_true", - default=False, - help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", - ) - parser.add_argument( - "--lr_scheduler", - type=str, - default="constant", - help=( - 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]' - ), - ) - parser.add_argument( - "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." - ) - parser.add_argument( - "--lr_num_cycles", - type=int, - default=1, - help="Number of hard resets of the lr in cosine_with_restarts scheduler.", - ) - parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") - parser.add_argument( - "--dataloader_num_workers", - type=int, - default=0, - help=( - "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." - ), - ) - parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") - parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") - parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") - parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") - parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") - parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") - parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") - parser.add_argument( - "--hub_model_id", - type=str, - default=None, - help="The name of the repository to keep in sync with the local `output_dir`.", - ) - parser.add_argument( - "--logging_dir", - type=str, - default="logs", - help=( - "[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. Will default to" - "*output_dir/logs" - ), - ) - parser.add_argument( - "--report_to", - type=str, - default="visualdl", - choices=["tensorboard", "visualdl"], - help="Log writer type.", - ) - parser.add_argument( - "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." 
- ) - parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") - - if input_args is not None: - args = parser.parse_args(input_args) - else: - args = parser.parse_args() - - if args.instance_data_dir is None: - raise ValueError("You must specify a train data directory.") - - if args.with_prior_preservation: - if args.class_data_dir is None: - raise ValueError("You must specify a data directory for class images.") - if args.class_prompt is None: - raise ValueError("You must specify prompt for class images.") - else: - # logger is not available yet - if args.class_data_dir is not None: - warnings.warn("You need not use --class_data_dir without --with_prior_preservation.") - if args.class_prompt is not None: - warnings.warn("You need not use --class_prompt without --with_prior_preservation.") - - args.logging_dir = os.path.join(args.output_dir, args.logging_dir) - if args.height is None or args.width is None and args.resolution is not None: - args.height = args.width = args.resolution - - return args - - -class DreamBoothDataset(Dataset): - """ - A dataset to prepare the instance and class images with the prompts for fine-tuning the model. - It pre-processes the images and the tokenizes prompts. - """ - - def __init__( - self, - instance_data_root, - instance_prompt, - tokenizer, - class_data_root=None, - class_prompt=None, - class_num=None, - height=512, - width=512, - center_crop=False, - interpolation="bilinear", - random_flip=False, - ): - self.height = height - self.width = width - self.center_crop = center_crop - self.tokenizer = tokenizer - - self.instance_data_root = Path(instance_data_root) - if not self.instance_data_root.exists(): - raise ValueError("Instance images root doesn't exists.") - ext = ["png", "jpg", "jpeg", "bmp", "PNG", "JPG", "JPEG", "BMP"] - self.instance_images_path = [] - for p in Path(instance_data_root).iterdir(): - if any(suffix in p.name for suffix in ext): - self.instance_images_path.append(p) - self.num_instance_images = len(self.instance_images_path) - self.instance_prompt = instance_prompt - self._length = self.num_instance_images - - if class_data_root is not None: - self.class_data_root = Path(class_data_root) - self.class_data_root.mkdir(parents=True, exist_ok=True) - self.class_images_path = [] - for p in Path(class_data_root).iterdir(): - if any(suffix in p.name for suffix in ext): - self.class_images_path.append(p) - if class_num is not None: - self.num_class_images = min(len(self.class_images_path), class_num) - else: - self.num_class_images = len(self.class_images_path) - self._length = max(self.num_class_images, self.num_instance_images) - self.class_prompt = class_prompt - else: - self.class_data_root = None - - self.image_transforms = transforms.Compose( - [ - transforms.Resize((height, width), interpolation=interpolation), - transforms.CenterCrop((height, width)) if center_crop else transforms.RandomCrop((height, width)), - transforms.RandomHorizontalFlip() if random_flip else Lambda(lambda x: x), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ] - ) - - def __len__(self): - return self._length - - def __getitem__(self, index): - example = {} - instance_image = Image.open(self.instance_images_path[index % self.num_instance_images]) - if not instance_image.mode == "RGB": - instance_image = instance_image.convert("RGB") - example["instance_images"] = self.image_transforms(instance_image) - example["instance_prompt_ids"] = self.tokenizer( - self.instance_prompt, - padding="do_not_pad", - 
truncation=True, - max_length=self.tokenizer.model_max_length, - return_attention_mask=False, - ).input_ids - - if self.class_data_root: - class_image = Image.open(self.class_images_path[index % self.num_class_images]) - if not class_image.mode == "RGB": - class_image = class_image.convert("RGB") - example["class_images"] = self.image_transforms(class_image) - example["class_prompt_ids"] = self.tokenizer( - self.class_prompt, - padding="do_not_pad", - truncation=True, - max_length=self.tokenizer.model_max_length, - return_attention_mask=False, - ).input_ids - - return example - - -class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." - - def __init__(self, prompt, num_samples): - self.prompt = prompt - self.num_samples = num_samples - - def __len__(self): - return self.num_samples - - def __getitem__(self, index): - example = {} - example["prompt"] = self.prompt - example["index"] = index - return example - - -def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): - if token is None: - token = HfFolder.get_token() - if organization is None: - username = whoami(token)["name"] - return f"{username}/{model_id}" - else: - return f"{organization}/{model_id}" - - -def main(): - args = parse_args() - rank = paddle.distributed.get_rank() - is_main_process = rank == 0 - num_processes = paddle.distributed.get_world_size() - if num_processes > 1: - paddle.distributed.init_parallel_env() - - # If passed along, set the training seed now. - if args.seed is not None: - set_seed(args.seed) - - # Generate class images if prior preservation is enabled. - if args.with_prior_preservation: - class_images_dir = Path(args.class_data_dir) - if not class_images_dir.exists(): - class_images_dir.mkdir(parents=True) - cur_class_images = len(list(class_images_dir.iterdir())) - - if cur_class_images < args.num_class_images: - pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, safety_checker=None, requires_safety_checker=False - ) - if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): - try: - pipeline.unet.enable_xformers_memory_efficient_attention() - except Exception as e: - logger.warning( - "Could not enable memory efficient attention. 
Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}" - ) - pipeline.set_progress_bar_config(disable=True) - - num_new_images = args.num_class_images - cur_class_images - logger.info(f"Number of class images to sample: {num_new_images}.") - - sample_dataset = PromptDataset(args.class_prompt, num_new_images) - batch_sampler = ( - DistributedBatchSampler(sample_dataset, batch_size=args.sample_batch_size, shuffle=False) - if num_processes > 1 - else BatchSampler(sample_dataset, batch_size=args.sample_batch_size, shuffle=False) - ) - sample_dataloader = DataLoader( - sample_dataset, batch_sampler=batch_sampler, num_workers=args.dataloader_num_workers - ) - - for example in tqdm(sample_dataloader, desc="Generating class images", disable=not is_main_process): - images = pipeline(example["prompt"]).images - - for i, image in enumerate(images): - hash_image = hashlib.sha1(image.tobytes()).hexdigest() - image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" - image.save(image_filename) - pipeline.to("cpu") - del pipeline - gc.collect() - - if is_main_process: - if args.output_dir is not None: - os.makedirs(args.output_dir, exist_ok=True) - - # Load the tokenizer - if args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) - elif args.pretrained_model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained(url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) - - # import correct text encoder class - text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path) - - # Load scheduler and models - noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") - text_encoder = text_encoder_cls.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "text_encoder") - ) - text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict() - if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]: - use_attention_mask = True - else: - use_attention_mask = False - vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae") - unet = UNet2DConditionModel.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="unet", - ) - - # We only train the additional adapter LoRA layers - freeze_params(vae.parameters()) - freeze_params(text_encoder.parameters()) - freeze_params(unet.parameters()) - - if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): - try: - unet.enable_xformers_memory_efficient_attention() - except Exception as e: - logger.warning( - "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}" - ) - # now we will add new LoRA weights to the attention layers - # It's important to realize here how many attention weights will be added and of which sizes - # The sizes of the attention layers consist only of two different variables: - # 1) - the "hidden_size", which is increased according to `unet.config.block_out_channels`. - # 2) - the "cross attention size", which is set to `unet.config.cross_attention_dim`. - - # Let's first see how many attention processors we will have to set. 
- # For Stable Diffusion, it should be equal to: - # - down blocks (2x attention layers) * (2x transformer layers) * (3x down blocks) = 12 - # - mid blocks (2x attention layers) * (1x transformer layers) * (1x mid blocks) = 2 - # - up blocks (2x attention layers) * (3x transformer layers) * (3x down blocks) = 18 - # => 32 layers - - # Set correct lora layers - unet_lora_attn_procs = {} - for name, attn_processor in unet.attn_processors.items(): - cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = unet.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(unet.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = unet.config.block_out_channels[block_id] - - if isinstance(attn_processor, AttnProcessor): - lora_attn_processor_class = LoRAAttnProcessor - elif isinstance(attn_processor, AttnProcessor2_5): - lora_attn_processor_class = LoRAAttnProcessor2_5 - else: - raise ValueError(f"Unknown attention processor type: {attn_processor.__class__.__name__}") - - unet_lora_attn_procs[name] = lora_attn_processor_class( - hidden_size=hidden_size, - cross_attention_dim=cross_attention_dim, - rank=args.lora_rank, - ) - - unet.set_attn_processor(unet_lora_attn_procs) - unet_lora_layers = AttnProcsLayers(unet.attn_processors) - - # The text encoder comes from 🤗 transformers, so we cannot directly modify it. - # So, instead, we monkey-patch the forward calls of its attention-blocks. For this, - # we first load a dummy pipeline with the text encoder and then do the monkey-patching. - text_encoder_lora_layers = None - if args.train_text_encoder: - text_lora_attn_procs = {} - for name, module in text_encoder.named_sublayers(include_self=True): - if name.endswith(TEXT_ENCODER_ATTN_MODULE): - text_lora_attn_procs[name] = LoRAAttnProcessor( - hidden_size=module.out_proj.weight.shape[1], - cross_attention_dim=None, - rank=args.lora_rank, - ) - text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs) - temp_pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, text_encoder=text_encoder - ) - temp_pipeline._modify_text_encoder(text_lora_attn_procs) - text_encoder = temp_pipeline.text_encoder - del temp_pipeline - - # Dataset and DataLoaders creation: - train_dataset = DreamBoothDataset( - instance_data_root=args.instance_data_dir, - instance_prompt=args.instance_prompt, - class_data_root=args.class_data_dir if args.with_prior_preservation else None, - class_prompt=args.class_prompt, - class_num=args.num_class_images, - tokenizer=tokenizer, - height=args.height, - width=args.width, - center_crop=args.center_crop, - interpolation="bilinear", - random_flip=args.random_flip, - ) - - def collate_fn(examples): - input_ids = [example["instance_prompt_ids"] for example in examples] - pixel_values = [example["instance_images"] for example in examples] - - # Concat class and instance examples for prior preservation. - # We do this to avoid doing two forward passes. 
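# With prior preservation enabled, input_ids and pixel_values become [instance examples..., class examples...],
# so a per-device batch of N yields 2N samples in one forward pass and the later chunk(2, axis=0)
# splits the prediction back into its instance and class halves.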
- if args.with_prior_preservation: - input_ids += [example["class_prompt_ids"] for example in examples] - pixel_values += [example["class_images"] for example in examples] - - pixel_values = paddle.stack(pixel_values).astype("float32") - - input_ids = tokenizer.pad( - {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pd" - ).input_ids - - return { - "input_ids": input_ids, - "pixel_values": pixel_values, - } - - train_sampler = ( - DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) - if num_processes > 1 - else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) - ) - train_dataloader = DataLoader( - train_dataset, batch_sampler=train_sampler, collate_fn=collate_fn, num_workers=args.dataloader_num_workers - ) - - # Scheduler and math around the number of training steps. - num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - if args.max_train_steps is None: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - - if args.scale_lr: - args.learning_rate = ( - args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes - ) - - lr_scheduler = get_scheduler( - args.lr_scheduler, - learning_rate=args.learning_rate, - num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, - num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, - num_cycles=args.lr_num_cycles, - power=args.lr_power, - ) - - params_to_optimize = ( - list(unet_lora_layers.parameters()) + list(text_encoder_lora_layers.parameters()) - if args.train_text_encoder - else unet_lora_layers.parameters() - ) - # Optimizer creation - optimizer = AdamW( - learning_rate=lr_scheduler, - parameters=params_to_optimize, - beta1=args.adam_beta1, - beta2=args.adam_beta2, - weight_decay=args.adam_weight_decay, - epsilon=args.adam_epsilon, - grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None, - ) - - if num_processes > 1: - unet = paddle.DataParallel(unet) - if args.train_text_encoder: - text_encoder = paddle.DataParallel(text_encoder) - - if is_main_process: - logger.info("----------- Configuration Arguments -----------") - for arg, value in sorted(vars(args).items()): - logger.info("%s: %s" % (arg, value)) - logger.info("------------------------------------------------") - writer = get_report_to(args) - - # Train! - total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps - - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num batches each epoch = {len(train_dataloader)}") - logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") - logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {args.max_train_steps}") - - # Only show the progress bar once on each machine. 
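Before the training loop starts, it may help to recall what the LoRA processors set up earlier actually learn: each adapted projection is a frozen base weight plus a trainable low-rank correction. A toy sketch of that idea, with illustrative shapes rather than the UNet's real dimensions:

```python
import paddle

hidden_size, rank = 320, 4                     # illustrative; --lora_rank defaults to 4
W = paddle.randn([hidden_size, hidden_size])   # frozen base projection weight
A = paddle.randn([rank, hidden_size]) * 0.01   # trainable down-projection
B = paddle.zeros([hidden_size, rank])          # trainable up-projection, zero-initialized

x = paddle.randn([1, hidden_size])
out = x @ W.T + x @ (B @ A).T                  # base output plus the low-rank LoRA correction
print(out.shape)                               # [1, 320]; only A and B would receive gradients
```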
- progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process) - progress_bar.set_description("Train Steps") - global_step = 0 - vae.eval() - if args.train_text_encoder: - text_encoder.train() - else: - text_encoder.eval() - - for epoch in range(args.num_train_epochs): - unet.train() - for step, batch in enumerate(train_dataloader): - # Convert images to latent space - latents = vae.encode(batch["pixel_values"]).latent_dist.sample() - latents = latents * vae.config.scaling_factor - - # Sample noise that we'll add to the latents - noise = paddle.randn(latents.shape, dtype=latents.dtype) - if args.noise_offset: - # https://www.crosslabs.org//blog/diffusion-with-offset-noise - noise += args.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype - ) - batch_size = latents.shape[0] - # Sample a random timestep for each image - timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,)).cast("int64") - - # Add noise to the latents according to the noise magnitude at each timestep - # (this is the forward diffusion process) - noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - - if num_processes > 1 and ( - args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0) - ): - # grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0: - # gradient_checkpointing, no_sync every where - # gradient_checkpointing + grad_acc, no_sync every where - unet_ctx_manager = unet.no_sync() - else: - unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() - - if use_attention_mask: - attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64") - else: - attention_mask = None - encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0] - - with unet_ctx_manager: - # Predict the noise residual / sample - model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample - - # Get the target for loss depending on the prediction type - if noise_scheduler.config.prediction_type == "epsilon": - target = noise - elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, timesteps) - else: - raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") - - if args.with_prior_preservation: - # Chunk the noise and model_pred into two parts and compute the loss on each part separately. - model_pred, model_pred_prior = model_pred.chunk(2, axis=0) - target, target_prior = target.chunk(2, axis=0) - - # Compute instance loss - loss = F.mse_loss(model_pred, target, reduction="mean") - - # Compute prior loss - prior_loss = F.mse_loss(model_pred_prior, target_prior, reduction="mean") - - # Add the prior loss to the instance loss. 
- loss = loss + args.prior_loss_weight * prior_loss - else: - loss = F.mse_loss(model_pred, target, reduction="mean") - - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - loss.backward() - - if (step + 1) % args.gradient_accumulation_steps == 0: - if num_processes > 1 and args.gradient_checkpointing: - fused_allreduce_gradients(params_to_optimize, None) - optimizer.step() - lr_scheduler.step() - optimizer.clear_grad() - progress_bar.update(1) - global_step += 1 - step_loss = loss.item() * args.gradient_accumulation_steps - logs = { - "epoch": str(epoch).zfill(4), - "step_loss": round(step_loss, 10), - "lr": lr_scheduler.get_lr(), - } - progress_bar.set_postfix(**logs) - - if is_main_process: - for name, val in logs.items(): - if name == "epoch": - continue - writer.add_scalar(f"train/{name}", val, global_step) - - if global_step % args.checkpointing_steps == 0: - save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - # We combine the text encoder and UNet LoRA parameters with a simple - # custom logic. So, use `LoraLoaderMixin.save_lora_weights()`. - LoraLoaderMixin.save_lora_weights( - save_directory=save_path, - unet_lora_layers=unet_lora_layers, - text_encoder_lora_layers=text_encoder_lora_layers, - ) - logger.info(f"Saved lora weights to {save_path}") - - if global_step >= args.max_train_steps: - break - - if is_main_process: - if args.validation_prompt is not None and epoch % args.validation_epochs == 0: - logger.info( - f"Running validation... \n Generating {args.num_validation_images} images with prompt:" - f" {args.validation_prompt}." - ) - # create pipeline - pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - unet=unwrap_model(unet), - text_encoder=unwrap_model(text_encoder), - safety_checker=None, - requires_safety_checker=False, - ) - pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) - pipeline.set_progress_bar_config(disable=True) - - # run inference - generator = paddle.Generator().manual_seed(args.seed) if args.seed else None - images = [ - pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0] - for _ in range(args.num_validation_images) - ] - np_images = np.stack([np.asarray(img) for img in images]) - - if args.report_to == "tensorboard": - writer.add_images("test", np_images, epoch, dataformats="NHWC") - else: - writer.add_image("test", np_images, epoch, dataformats="NHWC") - - del pipeline - if args.train_text_encoder: - text_encoder.train() - unet.train() - gc.collect() - - # Save the lora layers - if is_main_process: - LoraLoaderMixin.save_lora_weights( - save_directory=args.output_dir, - unet_lora_layers=unet_lora_layers, - text_encoder_lora_layers=text_encoder_lora_layers, - ) - - # Final inference - # Load previous pipeline - pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, safety_checker=None, requires_safety_checker=False - ) - pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) - # load attention processors - pipeline.load_lora_weights(args.output_dir) - - # run inference - if args.validation_prompt and args.num_validation_images > 0: - generator = paddle.Generator().manual_seed(args.seed) if args.seed else None - images = [ - pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0] - for _ in range(args.num_validation_images) - ] - np_images = np.stack([np.asarray(img) for img in images]) - - if 
args.report_to == "tensorboard": - writer.add_images("test", np_images, epoch, dataformats="NHWC") - else: - writer.add_image("test", np_images, epoch, dataformats="NHWC") - - writer.close() - - # logic to push to HF Hub - if args.push_to_hub: - if args.hub_model_id is None: - repo_id = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) - else: - repo_id = args.hub_model_id - - _retry( - create_repo, - func_kwargs={"repo_id": repo_id, "exist_ok": True, "token": args.hub_token}, - base_wait_time=1.0, - max_retries=5, - max_wait_time=10.0, - ) - - save_model_card( - repo_id, - images=images, - base_model=args.pretrained_model_name_or_path, - prompt=args.instance_prompt, - repo_folder=args.output_dir, - ) - # Upload model - logger.info(f"Pushing to {repo_id}") - _retry( - upload_folder, - func_kwargs={ - "repo_id": repo_id, - "repo_type": "model", - "folder_path": args.output_dir, - "commit_message": "End of training", - "token": args.hub_token, - "ignore_patterns": ["checkpoint-*/*", "logs/*"], - }, - base_wait_time=1.0, - max_retries=5, - max_wait_time=20.0, - ) - - -if __name__ == "__main__": - main() diff --git a/ppdiffusers/examples/inference/README.md b/ppdiffusers/examples/inference/README.md deleted file mode 100644 index fb3b03e5141e..000000000000 --- a/ppdiffusers/examples/inference/README.md +++ /dev/null @@ -1,36 +0,0 @@ -# Pipelines Overview - -The table below summarizes all supported Pipelines, along with the corresponding papers, tasks, and inference scripts. - -| Pipeline | Source link | Task | Inference script -|-------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------|:---:|:---:| -| [alt_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/alt_diffusion) | [**Alt Diffusion**](https://arxiv.org/abs/2211.06679) | *Text-to-Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_to_image_generation-alt_diffusion.py) -| [alt_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/alt_diffusion) | [**Alt Diffusion**](https://arxiv.org/abs/2211.06679) | *Image-to-Image Text-Guided Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/image_to_image_text_guided_generation-alt_diffusion.py) -| [audio_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio-diffusion) | *Unconditional Audio Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/unconditional_audio_generation-audio_diffusion.py) -| [controlnet](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/controlnet) | [**ControlNet with Stable Diffusion**](https://arxiv.org/abs/2302.05543) | *Image-to-Image Text-Guided Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/image_to_image_text_guided_generation-controlnet.py) -| [dance_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/Harmonai-org/sample-generator) | *Unconditional Audio Generation* |
[link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/unconditional_audio_generation-dance_diffusion.py) -| [ddpm](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | *Unconditional Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/unconditional_image_generation-ddpm.py) -| [ddim](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | *Unconditional Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/unconditional_image_generation-ddim.py) -| [latent_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | *Text-to-Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_to_image_generation-latent_diffusion.py) -| [latent_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | *Super Superresolution* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/super_resolution-latent_diffusion.py) -| [latent_diffusion_uncond](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | *Unconditional Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/unconditional_image_generation-latent_diffusion_uncond.py) -| [paint_by_example](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/paint_by_example) | [**Paint by Example: Exemplar-based Image Editing with Diffusion Models**](https://arxiv.org/abs/2211.13227) | *Image-Guided Image Inpainting* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/image_guided_image_inpainting-paint_by_example.py) -| [pndm](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | *Unconditional Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/unconditional_image_generation-pndm.py) -| [repaint](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/repaint) | [**Repaint**](https://arxiv.org/abs/2201.09865) | *Image Inpainting* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/image_inpainting-repaint.py) -| [score_sde_ve](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | *Unconditional Image Generation* | 
[link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/unconditional_image_generation-score_sde_ve.py) -| [semantic_stable_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion) | [**Semantic Guidance**](https://arxiv.org/abs/2301.12247) | *Text-Guided Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_guided_generation-semantic_stable_diffusion.py) -| [stable_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Text-to-Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion.py) -| [stable_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Image-to-Image Text-Guided Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion.py) -| [stable_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Text-Guided Image Inpainting* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion.py) -| [stable_diffusion_2](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | *Text-to-Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_2.py) -| [stable_diffusion_2](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | *Image-to-Image Text-Guided Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion_2.py) -| [stable_diffusion_2](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | *Text-Guided Image Inpainting* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion_2.py) -| [stable_diffusion_2](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | *Text-Guided Image Upscaling* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py) -| [stable_diffusion_2](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | *Text-Guided Image Upscaling* | 
[link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py) -| [stable_diffusion_safe](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | *Text-to-Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_safe.py) -| [stochastic_karras_ve](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | *Unconditional Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/unconditional_image_generation-stochastic_karras_ve.py) -| [unclip](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/unclip) | [**UnCLIP**](https://arxiv.org/abs/2204.06125) | *Text-to-Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_to_image_generation-unclip.py) -| [versatile_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion) | [**Versatile Diffusion**](https://arxiv.org/abs/2211.08332) | *Text-to-Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_to_image_generation-versatile_diffusion.py) -| [versatile_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion) | [**Versatile Diffusion**](https://arxiv.org/abs/2211.08332) | *Image Variation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/image_variation-versatile_diffusion.py) -| [versatile_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion) | [**Versatile Diffusion**](https://arxiv.org/abs/2211.08332) | *Dual Text and Image Guided Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/dual_text_and_image_guided_generation-versatile_diffusion.py) -| [vq_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/vq_diffusion) | [**VQ Diffusion**](https://arxiv.org/abs/2111.14822) | *Text-to-Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_to_image_generation-vq_diffusion.py) diff --git a/ppdiffusers/examples/inference/dual_text_and_image_guided_generation-versatile_diffusion.py b/ppdiffusers/examples/inference/dual_text_and_image_guided_generation-versatile_diffusion.py deleted file mode 100644 index fb7a20763c80..000000000000 --- a/ppdiffusers/examples/inference/dual_text_and_image_guided_generation-versatile_diffusion.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ppdiffusers import VersatileDiffusionDualGuidedPipeline -from ppdiffusers.utils import load_image - -url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/benz.jpg" -image = load_image(url) -text = "a red car in the sun" - -pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion") -pipe.remove_unused_weights() - -text_to_image_strength = 0.75 -image = pipe(prompt=text, image=image, text_to_image_strength=text_to_image_strength).images[0] -image.save("dual_text_and_image_guided_generation-versatile_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/image_guided_image_inpainting-paint_by_example.py b/ppdiffusers/examples/inference/image_guided_image_inpainting-paint_by_example.py deleted file mode 100644 index 99812e2bd212..000000000000 --- a/ppdiffusers/examples/inference/image_guided_image_inpainting-paint_by_example.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from ppdiffusers import PaintByExamplePipeline -from ppdiffusers.utils import load_image - -img_url = "https://paddlenlp.bj.bcebos.com/models/community/Fantasy-Studio/data/image_example_1.png" -mask_url = "https://paddlenlp.bj.bcebos.com/models/community/Fantasy-Studio/data/mask_example_1.png" -example_url = "https://paddlenlp.bj.bcebos.com/models/community/Fantasy-Studio/data/reference_example_1.jpeg" - -init_image = load_image(img_url).resize((512, 512)) -mask_image = load_image(mask_url).resize((512, 512)) -example_image = load_image(example_url).resize((512, 512)) - -pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example") - -# 使用fp16加快生成速度 -with paddle.amp.auto_cast(True): - image = pipe(image=init_image, mask_image=mask_image, example_image=example_image).images[0] -image.save("image_guided_image_inpainting-paint_by_example-result.png") diff --git a/ppdiffusers/examples/inference/image_inpainting-repaint.py b/ppdiffusers/examples/inference/image_inpainting-repaint.py deleted file mode 100644 index 3d4a971fd734..000000000000 --- a/ppdiffusers/examples/inference/image_inpainting-repaint.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ppdiffusers import RePaintPipeline, RePaintScheduler -from ppdiffusers.utils import load_image - -img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/celeba_hq_256.png" -mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/mask_256.png" - -# Load the original image and the mask as PIL images -original_image = load_image(img_url).resize((256, 256)) -mask_image = load_image(mask_url).resize((256, 256)) - -scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256", subfolder="scheduler") -pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler) - -output = pipe( - image=original_image, - mask_image=mask_image, - num_inference_steps=250, - eta=0.0, - jump_length=10, - jump_n_sample=10, -) -inpainted_image = output.images[0] - -inpainted_image.save("image_inpainting-repaint-result.png") diff --git a/ppdiffusers/examples/inference/image_mixing-stable_diffusion.py b/ppdiffusers/examples/inference/image_mixing-stable_diffusion.py deleted file mode 100644 index 2474e261438f..000000000000 --- a/ppdiffusers/examples/inference/image_mixing-stable_diffusion.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from io import BytesIO - -import paddle -import PIL -import requests - -from paddlenlp.transformers import CLIPFeatureExtractor, CLIPModel -from ppdiffusers import DiffusionPipeline - - -def download_image(url): - response = requests.get(url) - return PIL.Image.open(BytesIO(response.content)).convert("RGB") - - -# Loading additional models -feature_extractor = CLIPFeatureExtractor.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K") -clip_model = CLIPModel.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K", paddle_dtype=paddle.float16) - -mixing_pipeline = DiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - custom_pipeline="clip_guided_images_mixing_stable_diffusion", - clip_model=clip_model, - feature_extractor=feature_extractor, - paddle_dtype=paddle.float16, -) -mixing_pipeline.enable_attention_slicing() - -# Pipline running -generator = paddle.Generator().manual_seed(17) - - -content_image = download_image( - "https://paddlenlp.bj.bcebos.com/models/community/westfish/develop/clip_guided_images_mixing_stable_diffusion_images/boromir.jpg" -) -style_image = download_image( - "https://paddlenlp.bj.bcebos.com/models/community/westfish/develop/clip_guided_images_mixing_stable_diffusion_images/gigachad.jpg" -) - -pipe_images = mixing_pipeline( - num_inference_steps=50, - content_image=content_image, - style_image=style_image, - content_prompt="boromir", - style_prompt="gigachad", - noise_strength=0.65, - slerp_latent_style_strength=0.9, - slerp_prompt_style_strength=0.1, - slerp_clip_image_style_strength=0.1, - guidance_scale=9.0, - batch_size=1, - clip_guidance_scale=100, - generator=generator, -).images - -pipe_images[0].save("clip_guided_images_mixing_stable_diffusion.png") diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-alt_diffusion.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-alt_diffusion.py deleted file mode 100644 index 1525fc680c2c..000000000000 --- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-alt_diffusion.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle - -from ppdiffusers import AltDiffusionImg2ImgPipeline -from ppdiffusers.utils import load_image - -pipe = AltDiffusionImg2ImgPipeline.from_pretrained("BAAI/AltDiffusion") - -url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" - -init_image = load_image(url).resize((768, 512)) - -prompt = "奇幻的景观,以一种艺术的形式。" -# 使用fp16加快生成速度 -with paddle.amp.auto_cast(True): - image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0] - -image.save("image_to_image_text_guided_generation-alt_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-controlnet.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-controlnet.py deleted file mode 100644 index b1d9267b2ac0..000000000000 --- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-controlnet.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline -from ppdiffusers.utils import load_image - -controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") -pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet -) -pipe.set_progress_bar_config(disable=None) - -generator = paddle.Generator().manual_seed(0) -prompt = "bird" -image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" -) - -output = pipe(prompt, image, generator=generator) - -image = output.images[0] - -image.save("image_to_image_text_guided_generation-controlnet-result.png") diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-deepfloyd_if.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-deepfloyd_if.py deleted file mode 100644 index bdd71eb35c00..000000000000 --- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-deepfloyd_if.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle - -from ppdiffusers import IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline -from ppdiffusers.utils import load_image, pd_to_pil - -url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" -original_image = load_image(url) -original_image = original_image.resize((768, 512)) - -pipe = IFImg2ImgPipeline.from_pretrained( - "DeepFloyd/IF-I-XL-v1.0", - variant="fp16", - paddle_dtype=paddle.float16, -) -pipe.enable_xformers_memory_efficient_attention() -prompt = "A fantasy landscape in style minecraft" -prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) - -image = pipe( - image=original_image, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_embeds, - output_type="pd", -).images -pipe.to(paddle_device="cpu") - -# save intermediate image -pil_image = pd_to_pil(image) -pil_image[0].save("./image_to_image_text_guided_generation-deepfloyd_if-if_stage_I.png") - -super_res_1_pipe = IFImg2ImgSuperResolutionPipeline.from_pretrained( - "DeepFloyd/IF-II-L-v1.0", - text_encoder=None, - variant="fp16", - paddle_dtype=paddle.float16, -) -super_res_1_pipe.enable_xformers_memory_efficient_attention() - -image = super_res_1_pipe( - image=image, - original_image=original_image, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_embeds, -).images -image[0].save("./image_to_image_text_guided_generation-deepfloyd_if-if_stage_II.png") diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion.py deleted file mode 100644 index 5b2d857d58b4..000000000000 --- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from ppdiffusers import StableDiffusionImg2ImgPipeline -from ppdiffusers.utils import load_image - -# 加载pipeline -pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - -# 下载初始图片 -url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" - -init_image = load_image(url).resize((768, 512)) - -prompt = "A fantasy landscape, trending on artstation" -# 使用fp16加快生成速度 -with paddle.amp.auto_cast(True): - image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0] - -image.save("image_to_image_text_guided_generation-stable_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion_2.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion_2.py deleted file mode 100644 index 67472607587b..000000000000 --- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion_2.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from ppdiffusers import StableDiffusionImg2ImgPipeline -from ppdiffusers.utils import load_image - -pipe = StableDiffusionImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2") - -# 下载初始图片 -url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" - -init_image = load_image(url).resize((768, 768)) - -prompt = "A fantasy landscape, trending on artstation" -# 使用fp16加快生成速度 -with paddle.amp.auto_cast(True): - image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0] - -image.save("image_to_image_text_guided_generation-stable_diffusion_2-result.png") diff --git a/ppdiffusers/examples/inference/image_to_text_generation-unidiffuser.py b/ppdiffusers/examples/inference/image_to_text_generation-unidiffuser.py deleted file mode 100644 index 1c7678b55930..000000000000 --- a/ppdiffusers/examples/inference/image_to_text_generation-unidiffuser.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ppdiffusers import UniDiffuserPipeline -from ppdiffusers.utils import load_image - -pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser") -image = load_image("https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg") -result = pipe(mode="i2t", image=image, prompt=None) -text = result.texts[0] -with open("image_to_text_generation-unidiffuser-result.txt", "w") as f: - print("{}\n".format(text), file=f) diff --git a/ppdiffusers/examples/inference/image_variation-stable_diffusion.py b/ppdiffusers/examples/inference/image_variation-stable_diffusion.py deleted file mode 100644 index b4d0b9d392e1..000000000000 --- a/ppdiffusers/examples/inference/image_variation-stable_diffusion.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - - -from paddle.vision import transforms - -from ppdiffusers import StableDiffusionImageVariationPipeline -from ppdiffusers.utils import load_image - -sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained( - "lambdalabs/sd-image-variations-diffusers", - revision="v2.0", - from_diffusers=True, - from_hf_hub=True, -) - -im = load_image("https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg") - -tform = transforms.Compose( - [ - transforms.ToTensor(), - transforms.Resize( - (224, 224), - interpolation="bicubic", - ), - transforms.Normalize([0.48145466, 0.4578275, 0.40821073], [0.26862954, 0.26130258, 0.27577711]), - ] -) -inp = tform(im) - -out = sd_pipe(im, guidance_scale=3) -out["images"][0].save("image_variation-stable_diffusion-result.jpg") diff --git a/ppdiffusers/examples/inference/image_variation-unidiffuser.py b/ppdiffusers/examples/inference/image_variation-unidiffuser.py deleted file mode 100644 index efa2ba77d93a..000000000000 --- a/ppdiffusers/examples/inference/image_variation-unidiffuser.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from ppdiffusers import UniDiffuserPipeline -from ppdiffusers.utils import load_image - -pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser") -image = load_image("https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg") -result = pipe(mode="i2t2i", image=image, prompt=None) -image = result.images[0] -image.save("image_variation-unidiffuser-result.png") diff --git a/ppdiffusers/examples/inference/image_variation-versatile_diffusion.py b/ppdiffusers/examples/inference/image_variation-versatile_diffusion.py deleted file mode 100644 index 3b2ec2596cbc..000000000000 --- a/ppdiffusers/examples/inference/image_variation-versatile_diffusion.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from ppdiffusers import VersatileDiffusionImageVariationPipeline -from ppdiffusers.utils import load_image - -url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/benz.jpg" -image = load_image(url) - -pipe = VersatileDiffusionImageVariationPipeline.from_pretrained("shi-labs/versatile-diffusion") - -image = pipe(image).images[0] -image.save("image_variation-versatile_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/super_resolution-latent_diffusion.py b/ppdiffusers/examples/inference/super_resolution-latent_diffusion.py deleted file mode 100644 index a986de034bc0..000000000000 --- a/ppdiffusers/examples/inference/super_resolution-latent_diffusion.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from ppdiffusers import LDMSuperResolutionPipeline -from ppdiffusers.utils import load_image - -# 加载pipeline -pipe = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages") - -# 下载初始图片 -url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" - -init_image = load_image(url).resize((128, 128)) -init_image.save("original-image.png") - -# 使用fp16加快生成速度 -with paddle.amp.auto_cast(True): - image = pipe(init_image, num_inference_steps=100, eta=1).images[0] - -image.save("super_resolution-latent_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/text_guided_generation-semantic_stable_diffusion.py b/ppdiffusers/examples/inference/text_guided_generation-semantic_stable_diffusion.py deleted file mode 100644 index b6b29f140e86..000000000000 --- a/ppdiffusers/examples/inference/text_guided_generation-semantic_stable_diffusion.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle - -from ppdiffusers import SemanticStableDiffusionPipeline - -pipe = SemanticStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") -pipe.set_progress_bar_config(disable=None) -prompt = "a photo of a cat" -edit = { - "editing_prompt": ["sunglasses"], - "reverse_editing_direction": [False], - "edit_warmup_steps": 10, - "edit_guidance_scale": 6, - "edit_threshold": 0.95, - "edit_momentum_scale": 0.5, - "edit_mom_beta": 0.6, -} -seed = 3 -guidance_scale = 7 -generator = paddle.Generator().manual_seed(seed) -output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - width=512, - height=512, -) -image = output.images[0] -image.save("text_guided_generation-semantic_stable_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/text_guided_image_inpainting-deepfloyd_if.py b/ppdiffusers/examples/inference/text_guided_image_inpainting-deepfloyd_if.py deleted file mode 100644 index d28b6c0c44b4..000000000000 --- a/ppdiffusers/examples/inference/text_guided_image_inpainting-deepfloyd_if.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from ppdiffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline -from ppdiffusers.utils import load_image, pd_to_pil - -url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/person.png" -original_image = load_image(url) - -url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/glasses_mask.png" -mask_image = load_image(url) - -pipe = IFInpaintingPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16) -pipe.enable_xformers_memory_efficient_attention() -prompt = "blue sunglasses" -prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) - -image = pipe( - image=original_image, - mask_image=mask_image, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_embeds, - output_type="pd", -).images -pipe.to(paddle_device="cpu") -# save intermediate image -pil_image = pd_to_pil(image) -pil_image[0].save("./text_guided_image_inpainting-deepfloyd_if-if_stage_I.png") - -super_res_1_pipe = IFInpaintingSuperResolutionPipeline.from_pretrained( - "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", paddle_dtype=paddle.float16 -) -super_res_1_pipe.enable_xformers_memory_efficient_attention() - -image = super_res_1_pipe( - image=image, - mask_image=mask_image, - original_image=original_image, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_embeds, -).images -image[0].save("./text_guided_image_inpainting-deepfloyd_if-if_stage_II.png") diff --git a/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion.py b/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion.py deleted file mode 100644 index 0fdfe1946a84..000000000000 --- 
a/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from ppdiffusers import StableDiffusionInpaintPipelineLegacy -from ppdiffusers.utils import load_image - -img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" -mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" - -init_image = load_image(img_url).resize((512, 512)) -mask_image = load_image(mask_url).resize((512, 512)) - -pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") - -prompt = "a cat sitting on a bench" -with paddle.amp.auto_cast(True): - image = pipe(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images[0] - -image.save("text_guided_image_inpainting-stable_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion_2.py b/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion_2.py deleted file mode 100644 index 6b27f9a60cf8..000000000000 --- a/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion_2.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ppdiffusers import StableDiffusionInpaintPipeline -from ppdiffusers.utils import load_image - -img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" -mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" - -init_image = load_image(img_url).resize((512, 512)) -mask_image = load_image(mask_url).resize((512, 512)) - -pipe = StableDiffusionInpaintPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting") - -prompt = "Face of a yellow cat, high resolution, sitting on a park bench" -# image and mask_image should be PIL images. 
-# The mask structure is white for inpainting and black for keeping as is -image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0] -image.save("text_guided_image_inpainting-stable_diffusion_2-result.png") diff --git a/ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py b/ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py deleted file mode 100644 index de2298e710d3..000000000000 --- a/ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ppdiffusers import StableDiffusionUpscalePipeline -from ppdiffusers.utils import load_image - -pipe = StableDiffusionUpscalePipeline.from_pretrained("stabilityai/stable-diffusion-x4-upscaler") - -url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/low_res_cat.png" -low_res_img = load_image(url).resize((128, 128)) - -prompt = "a white cat" -upscaled_image = pipe(prompt=prompt, image=low_res_img).images[0] -upscaled_image.save("text_guided_image_upscaling-stable_diffusion_2-result.png") diff --git a/ppdiffusers/examples/inference/text_to_audio_generation-audio_ldm.py b/ppdiffusers/examples/inference/text_to_audio_generation-audio_ldm.py deleted file mode 100644 index 2b4c1b1330a9..000000000000 --- a/ppdiffusers/examples/inference/text_to_audio_generation-audio_ldm.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle -import scipy -from IPython.display import Audio, display - -from ppdiffusers import AudioLDMPipeline - -pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", paddle_dtype=paddle.float16) - -prompt = "Techno music with a strong, upbeat tempo and high melodic riffs" -audio = pipe(prompt, num_inference_steps=10, audio_length_in_s=5.0).audios[0] - -output_path = "text_to_audio_generation-audio_ldm-techno.wav" -# save the audio sample as a .wav file -scipy.io.wavfile.write(output_path, rate=16000, data=audio) - -# 可以直接使用 IPython.display.Audio 来显示音频文件 -display(Audio(output_path)) diff --git a/ppdiffusers/examples/inference/text_to_image_generation-alt_diffusion.py b/ppdiffusers/examples/inference/text_to_image_generation-alt_diffusion.py deleted file mode 100644 index fccaff284995..000000000000 --- a/ppdiffusers/examples/inference/text_to_image_generation-alt_diffusion.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ppdiffusers import AltDiffusionPipeline, DPMSolverMultistepScheduler - -scheduler = DPMSolverMultistepScheduler.from_pretrained("BAAI/AltDiffusion", subfolder="scheduler") -pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", scheduler=scheduler) - -prompt = "黑暗精灵公主,非常详细,幻想,非常详细,数字绘画,概念艺术,敏锐的焦点,插图" -# or in English: -# prompt = "dark elf princess, highly detailed, d & d, fantasy, highly detailed, digital painting, trending on artstation, concept art, sharp focus, illustration, art by artgerm and greg rutkowski and fuji choko and viktoria gavrilenko and hoang lap" - -image = pipe(prompt, num_inference_steps=25).images[0] -image.save("text_to_image_generation-alt_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation-controlnet.py b/ppdiffusers/examples/inference/text_to_image_generation-controlnet.py deleted file mode 100644 index d68d9b4d7718..000000000000 --- a/ppdiffusers/examples/inference/text_to_image_generation-controlnet.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import cv2 -import numpy as np -from PIL import Image - -from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline -from ppdiffusers.utils import load_image - -controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") - -pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None -) - -resolution = 512 -image = np.array( - load_image("https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png") -) -image = cv2.Canny(image, 100, 200) -image = image[:, :, None] -image = np.concatenate([image, image, image], axis=2) -canny_image = Image.fromarray(image) -canny_image = canny_image.resize((resolution, resolution)) - - -prompt = "bird" -image = pipe( - prompt=prompt, - image=canny_image, - num_inference_steps=50, - height=resolution, - width=resolution, - controlnet_conditioning_scale=1.0, -).images[0] -image.save("text_to_image_generation-controlnet-result-bird_canny.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation-deepfloyd_if.py b/ppdiffusers/examples/inference/text_to_image_generation-deepfloyd_if.py deleted file mode 100644 index ea8e63be88d3..000000000000 --- a/ppdiffusers/examples/inference/text_to_image_generation-deepfloyd_if.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle - -from ppdiffusers import DiffusionPipeline, IFPipeline, IFSuperResolutionPipeline -from ppdiffusers.utils import pd_to_pil - -# Stage 1: generate images -pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16) -pipe.enable_xformers_memory_efficient_attention() -prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"' -prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) -image = pipe( - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_embeds, - output_type="pd", -).images - -# save intermediate image -pil_image = pd_to_pil(image) -pil_image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_I.png") -# save gpu memory -pipe.to(paddle_device="cpu") - -# Stage 2: super resolution stage1 -super_res_1_pipe = IFSuperResolutionPipeline.from_pretrained( - "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", paddle_dtype=paddle.float16 -) -super_res_1_pipe.enable_xformers_memory_efficient_attention() - -image = super_res_1_pipe( - image=image, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_embeds, - output_type="pd", -).images -# save intermediate image -pil_image = pd_to_pil(image) -pil_image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_II.png") -# save gpu memory -super_res_1_pipe.to(paddle_device="cpu") - -# Stage 3: super resolution stage2 -super_res_2_pipe = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-x4-upscaler", paddle_dtype=paddle.float16 -) -super_res_2_pipe.enable_xformers_memory_efficient_attention() - -image = super_res_2_pipe( - prompt=prompt, - image=image, -).images -image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_III.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation-latent_diffusion.py b/ppdiffusers/examples/inference/text_to_image_generation-latent_diffusion.py deleted file mode 100644 index 4e8fcc4a7144..000000000000 --- a/ppdiffusers/examples/inference/text_to_image_generation-latent_diffusion.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ppdiffusers import LDMTextToImagePipeline - -# 加载模型和scheduler -pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256") - -# 执行pipeline进行推理 -prompt = "a photo of an astronaut riding a horse on mars" -image = pipe(prompt, guidance_scale=7.5).images[0] - -# 保存图片 -image.save("text_to_image_generation-latent_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion.py b/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion.py deleted file mode 100644 index d19a9917e430..000000000000 --- a/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ppdiffusers import StableDiffusionPipeline - -# 加载模型和scheduler -pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - -# 执行pipeline进行推理 -prompt = "a photo of an astronaut riding a horse on mars" -image = pipe(prompt).images[0] - -# 保存图片 -image.save("text_to_image_generation-stable_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_2.py b/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_2.py deleted file mode 100644 index 51acd2a3b9cc..000000000000 --- a/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_2.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ppdiffusers import StableDiffusionPipeline - -# 加载模型和scheduler -pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2") -# 执行pipeline进行推理 -prompt = "a photo of an astronaut riding a horse on mars" -image = pipe(prompt).images[0] - -# 保存图片 -image.save("text_to_image_generation-stable_diffusion_2-result.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_safe.py b/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_safe.py deleted file mode 100644 index 4a71ac1a6b27..000000000000 --- a/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_safe.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ppdiffusers import StableDiffusionPipelineSafe -from ppdiffusers.pipelines.stable_diffusion_safe import SafetyConfig - -pipe = StableDiffusionPipelineSafe.from_pretrained("runwayml/stable-diffusion-v1-5") -print(pipe.safety_concept) -prompt = "the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c. 
leyendecker" -out = pipe(prompt=prompt, **SafetyConfig.MAX) -out.images[0].save("text_to_image_generation-stable_diffusion_safe-result.png.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation-t2i-adapter.py b/ppdiffusers/examples/inference/text_to_image_generation-t2i-adapter.py deleted file mode 100644 index 0d0ef4e6ce81..000000000000 --- a/ppdiffusers/examples/inference/text_to_image_generation-t2i-adapter.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import paddle - -from ppdiffusers import StableDiffusionAdapterPipeline, T2IAdapter -from ppdiffusers.utils import PIL_INTERPOLATION, load_image - -input_image = load_image("https://huggingface.co/RzZ/sd-v1-4-adapter-color/resolve/main/color_ref.png") -color_palette = input_image.resize((8, 8)) -color_palette = color_palette.resize((512, 512), resample=PIL_INTERPOLATION["nearest"]) - -adapter = T2IAdapter.from_pretrained("westfish/sd-v1-4-adapter-color") - -pipe = StableDiffusionAdapterPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - adapter=adapter, - paddle_dtype=paddle.float16, -) - -image = pipe( - prompt="At night, glowing cubes in front of the beach", - image=color_palette, -).images[0] -image.save("text_to_image_generation-t2i-adapter-result-color_adapter.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation-unclip.py b/ppdiffusers/examples/inference/text_to_image_generation-unclip.py deleted file mode 100644 index 7e6bad79f36e..000000000000 --- a/ppdiffusers/examples/inference/text_to_image_generation-unclip.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from ppdiffusers import UnCLIPPipeline - -# 加载模型和scheduler -pipe = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha") - -# 执行pipeline进行推理 -prompt = "a high-resolution photograph of a big red frog on a green leaf." -image = pipe(prompt).images[0] - -# 保存图片 -image.save("text_to_image_generation-unclip-result.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation-unidiffuser.py b/ppdiffusers/examples/inference/text_to_image_generation-unidiffuser.py deleted file mode 100644 index 031b9c0f2627..000000000000 --- a/ppdiffusers/examples/inference/text_to_image_generation-unidiffuser.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from ppdiffusers import UniDiffuserPipeline - -pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser") -prompt = "an elephant under the sea" -result = pipe(mode="t2i", image=None, prompt=prompt) -image = result.images[0] -image.save("text_to_image_generation-unidiffuser-result.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation-versatile_diffusion.py b/ppdiffusers/examples/inference/text_to_image_generation-versatile_diffusion.py deleted file mode 100644 index d777a8ce31db..000000000000 --- a/ppdiffusers/examples/inference/text_to_image_generation-versatile_diffusion.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ppdiffusers import VersatileDiffusionTextToImagePipeline - -pipe = VersatileDiffusionTextToImagePipeline.from_pretrained("shi-labs/versatile-diffusion") -pipe.remove_unused_weights() - -image = pipe("an astronaut riding on a horse on mars").images[0] -image.save("text_to_image_generation-versatile_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation-vq_diffusion.py b/ppdiffusers/examples/inference/text_to_image_generation-vq_diffusion.py deleted file mode 100644 index 8d11d8d280dd..000000000000 --- a/ppdiffusers/examples/inference/text_to_image_generation-vq_diffusion.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from ppdiffusers import VQDiffusionPipeline - -pipe = VQDiffusionPipeline.from_pretrained("microsoft/vq-diffusion-ithq") - -output = pipe("teddy bear playing in the pool", truncation_rate=1.0) - -image = output.images[0] -image.save("text_to_image_generation-vq_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation_mixture_tiling-stable_diffusion.py b/ppdiffusers/examples/inference/text_to_image_generation_mixture_tiling-stable_diffusion.py deleted file mode 100644 index a3b3b0791e06..000000000000 --- a/ppdiffusers/examples/inference/text_to_image_generation_mixture_tiling-stable_diffusion.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ppdiffusers import DiffusionPipeline, LMSDiscreteScheduler - -# Creater scheduler and model (similar to StableDiffusionPipeline) -scheduler = LMSDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000 -) -pipeline = DiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", scheduler=scheduler, custom_pipeline="mixture_tiling.py" -) -pipeline - -# Mixture of Diffusers generation -image = pipeline( - prompt=[ - [ - "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - ] - ], - tile_height=640, - tile_width=640, - tile_row_overlap=0, - tile_col_overlap=256, - guidance_scale=8, - seed=7178915308, - num_inference_steps=50, -)["images"][0] -image.save("mixture_tiling" + ".png") diff --git a/ppdiffusers/examples/inference/text_to_video_generation-synth.py b/ppdiffusers/examples/inference/text_to_video_generation-synth.py deleted file mode 100644 index 9da05c3085a0..000000000000 --- a/ppdiffusers/examples/inference/text_to_video_generation-synth.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import imageio - -from ppdiffusers import DPMSolverMultistepScheduler, TextToVideoSDPipeline - -pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b") -pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - -prompt = "An astronaut riding a horse." -video_frames = pipe(prompt, num_inference_steps=25).frames -imageio.mimsave("text_to_video_generation-synth-result-astronaut_riding_a_horse.mp4", video_frames, fps=8) diff --git a/ppdiffusers/examples/inference/text_to_video_generation-zero.py b/ppdiffusers/examples/inference/text_to_video_generation-zero.py deleted file mode 100644 index 0e4efb3563d5..000000000000 --- a/ppdiffusers/examples/inference/text_to_video_generation-zero.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import imageio - -# pip install imageio[ffmpeg] -import paddle - -from ppdiffusers import TextToVideoZeroPipeline - -model_id = "runwayml/stable-diffusion-v1-5" -pipe = TextToVideoZeroPipeline.from_pretrained(model_id, paddle_dtype=paddle.float16) - -prompt = "A panda is playing guitar on times square" -result = pipe(prompt=prompt).images -result = [(r * 255).astype("uint8") for r in result] -imageio.mimsave("text_to_video_generation-zero-result-panda.mp4", result, fps=4) diff --git a/ppdiffusers/examples/inference/text_variation-unidiffuser.py b/ppdiffusers/examples/inference/text_variation-unidiffuser.py deleted file mode 100644 index b287426d98a0..000000000000 --- a/ppdiffusers/examples/inference/text_variation-unidiffuser.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from ppdiffusers import UniDiffuserPipeline - -pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser") -prompt = "an elephant under the sea" -result = pipe(mode="t2i2t", image=None, prompt=prompt) -text = result.texts[0] -with open("text_variation-unidiffuser-result.txt", "w") as f: - print("{}\n".format(text), file=f) diff --git a/ppdiffusers/examples/inference/unconditional_audio_generation-audio_diffusion.py b/ppdiffusers/examples/inference/unconditional_audio_generation-audio_diffusion.py deleted file mode 100644 index e1914bab67da..000000000000 --- a/ppdiffusers/examples/inference/unconditional_audio_generation-audio_diffusion.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -from scipy.io.wavfile import write - -from ppdiffusers import AudioDiffusionPipeline - -# 加载模型和scheduler -pipe = AudioDiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256") -pipe.set_progress_bar_config(disable=None) -generator = paddle.Generator().manual_seed(42) - -output = pipe(generator=generator) -audio = output.audios[0] -image = output.images[0] - -# 保存音频到本地 -for i, audio in enumerate(audio): - write(f"audio_diffusion_test{i}.wav", pipe.mel.sample_rate, audio.transpose()) - -# 保存图片 -image.save("unconditional_audio_generation-audio_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/unconditional_audio_generation-dance_diffusion.py b/ppdiffusers/examples/inference/unconditional_audio_generation-dance_diffusion.py deleted file mode 100644 index 2bb2747f936f..000000000000 --- a/ppdiffusers/examples/inference/unconditional_audio_generation-dance_diffusion.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from scipy.io.wavfile import write - -from ppdiffusers import DanceDiffusionPipeline - -# 加载模型和scheduler -pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k") - -# 生成4s钟的音频 -audios = pipe(audio_length_in_s=4.0).audios - -# 保存音频到本地 -for i, audio in enumerate(audios): - write(f"unconditional_audio_generation-dance_diffusion-result_{i}.wav", pipe.unet.sample_rate, audio.transpose()) diff --git a/ppdiffusers/examples/inference/unconditional_audio_generation-spectrogram_diffusion.py b/ppdiffusers/examples/inference/unconditional_audio_generation-spectrogram_diffusion.py deleted file mode 100644 index 5d1f4a978a10..000000000000 --- a/ppdiffusers/examples/inference/unconditional_audio_generation-spectrogram_diffusion.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle -import scipy -from IPython.display import Audio, display - -from ppdiffusers import MidiProcessor, SpectrogramDiffusionPipeline -from ppdiffusers.utils.download_utils import ppdiffusers_url_download - -# Download MIDI from: wget https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid -mid_file_path = ppdiffusers_url_download( - "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid", cache_dir="." -) -pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion", paddle_dtype=paddle.float16) -processor = MidiProcessor() -output = pipe(processor(mid_file_path)) -audio = output.audios[0] - -output_path = "unconditional_audio_generation-spectrogram_diffusion-result-beethoven_hammerklavier_2.wav" -# save the audio sample as a .wav file -scipy.io.wavfile.write(output_path, rate=16000, data=audio) - -# 可以直接使用 IPython.display.Audio 来显示音频文件 -display(Audio(output_path)) diff --git a/ppdiffusers/examples/inference/unconditional_image_generation-ddim.py b/ppdiffusers/examples/inference/unconditional_image_generation-ddim.py deleted file mode 100644 index b4e51784c6c3..000000000000 --- a/ppdiffusers/examples/inference/unconditional_image_generation-ddim.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ppdiffusers import DDIMPipeline - -# 加载模型和scheduler -pipe = DDIMPipeline.from_pretrained("dboshardy/ddim-butterflies-128") - -# 执行pipeline进行推理 -image = pipe(num_inference_steps=25).images[0] - -# 保存图片 -image.save("unconditional_image_generation-ddim-result.png") diff --git a/ppdiffusers/examples/inference/unconditional_image_generation-ddpm.py b/ppdiffusers/examples/inference/unconditional_image_generation-ddpm.py deleted file mode 100644 index 48cfa5323d8d..000000000000 --- a/ppdiffusers/examples/inference/unconditional_image_generation-ddpm.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from ppdiffusers import DDPMPipeline - -# 加载模型和scheduler -pipe = DDPMPipeline.from_pretrained("google/ddpm-celebahq-256") - -# 执行pipeline进行推理 -image = pipe().images[0] - -# 保存图片 -image.save("unconditional_image_generation-ddpm-result.png") diff --git a/ppdiffusers/examples/inference/unconditional_image_generation-latent_diffusion_uncond.py b/ppdiffusers/examples/inference/unconditional_image_generation-latent_diffusion_uncond.py deleted file mode 100644 index 0d602ad08452..000000000000 --- a/ppdiffusers/examples/inference/unconditional_image_generation-latent_diffusion_uncond.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ppdiffusers import LDMPipeline - -# 加载模型和scheduler -pipe = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256") - -# 执行pipeline进行推理 -image = pipe(num_inference_steps=200).images[0] - -# 保存图片 -image.save("unconditional_image_generation-latent_diffusion_uncond-result.png") diff --git a/ppdiffusers/examples/inference/unconditional_image_generation-pndm.py b/ppdiffusers/examples/inference/unconditional_image_generation-pndm.py deleted file mode 100644 index 5c0f23036f15..000000000000 --- a/ppdiffusers/examples/inference/unconditional_image_generation-pndm.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ppdiffusers import PNDMPipeline - -# 加载模型和scheduler -pipe = PNDMPipeline.from_pretrained("google/ddpm-celebahq-256") - -# 执行pipeline进行推理 -image = pipe(num_inference_steps=1000).images[0] - -# 保存图片 -image.save("unconditional_image_generation-pndm-result.png") diff --git a/ppdiffusers/examples/inference/unconditional_image_generation-score_sde_ve.py b/ppdiffusers/examples/inference/unconditional_image_generation-score_sde_ve.py deleted file mode 100644 index 5314d46ff8d5..000000000000 --- a/ppdiffusers/examples/inference/unconditional_image_generation-score_sde_ve.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ppdiffusers import ScoreSdeVePipeline - -# 加载模型和scheduler -pipe = ScoreSdeVePipeline.from_pretrained("google/ncsnpp-ffhq-1024") - -# 执行pipeline进行推理 -image = pipe().images[0] - -# 保存图片 -image.save("unconditional_image_generation-score_sde_ve-result.png") diff --git a/ppdiffusers/examples/inference/unconditional_image_generation-stochastic_karras_ve.py b/ppdiffusers/examples/inference/unconditional_image_generation-stochastic_karras_ve.py deleted file mode 100644 index 90f93ac299ed..000000000000 --- a/ppdiffusers/examples/inference/unconditional_image_generation-stochastic_karras_ve.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ppdiffusers import KarrasVePipeline, KarrasVeScheduler - -scheduler = KarrasVeScheduler() -# 加载模型和scheduler -pipe = KarrasVePipeline.from_pretrained("google/ncsnpp-celebahq-256", scheduler=scheduler) - -# 执行pipeline进行推理 -image = pipe().images - -# 保存图片 -image[0].save("unconditional_image_generation-stochastic_karras_ve-result.png") diff --git a/ppdiffusers/examples/inference/unconditional_image_generation-unidiffuser.py b/ppdiffusers/examples/inference/unconditional_image_generation-unidiffuser.py deleted file mode 100644 index a729d77271ab..000000000000 --- a/ppdiffusers/examples/inference/unconditional_image_generation-unidiffuser.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from ppdiffusers import UniDiffuserPipeline - -pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser") -result = pipe(mode="i", image=None, prompt=None) -image = result.images[0] -image.save("unconditional_image_generation-unidiffuser-result.png") diff --git a/ppdiffusers/examples/inference/unconditional_image_text_joint_generation-unidiffuser.py b/ppdiffusers/examples/inference/unconditional_image_text_joint_generation-unidiffuser.py deleted file mode 100644 index 5b999ce4a8da..000000000000 --- a/ppdiffusers/examples/inference/unconditional_image_text_joint_generation-unidiffuser.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from ppdiffusers import UniDiffuserPipeline - -pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser") -result = pipe(mode="joint", image=None, prompt=None) -image = result.images[0] -image.save("unconditional_image_text_generation-unidiffuser-result.png") -text = result.texts[0] -with open("unconditional_image_text_generation-unidiffuser-result.txt", "w") as f: - print("{}\n".format(text), file=f) diff --git a/ppdiffusers/examples/inference/unconditional_text_generation-unidiffuser.py b/ppdiffusers/examples/inference/unconditional_text_generation-unidiffuser.py deleted file mode 100644 index 3c9e57b970fc..000000000000 --- a/ppdiffusers/examples/inference/unconditional_text_generation-unidiffuser.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from ppdiffusers import UniDiffuserPipeline - -pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser") -result = pipe(mode="t", image=None, prompt=None) -text = result.texts[0] -with open("unconditional_text_generation-unidiffuser-result.txt", "w") as f: - print("{}\n".format(text), file=f) diff --git a/ppdiffusers/examples/reproduce/README.md b/ppdiffusers/examples/reproduce/README.md deleted file mode 100644 index 42cb9a282249..000000000000 --- a/ppdiffusers/examples/reproduce/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# Stable Diffusion - -## Using `StableDiffusionAttendAndExcitePipeline` with `PNDMScheduler` - -Given a pre-trained text-to-image diffusion model (e.g., Stable Diffusion) the method, Attend-and-Excite, guides the generative model to modify the cross-attention values during the image synthesis process to generate images that more faithfully depict the input text prompt. 
-
-You can run this pipeline as follows:
-
-```python
-from pathlib import Path
-import paddle
-from ppdiffusers import StableDiffusionAttendAndExcitePipeline, PNDMScheduler
-
-scheduler = PNDMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
-pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4",
-    scheduler=scheduler,
-)
-
-seed = 123
-prompt = "A playful kitten chasing a butterfly in a wildflower meadow"
-token_indices = [3, 6, 10]
-
-generator = paddle.Generator().manual_seed(seed)
-image = pipe(
-    prompt=prompt,
-    token_indices=token_indices,
-    generator=generator,
-).images[0]
-
-# save the result under a per-prompt directory
-output_dir = Path("output_pd")
-prompt_output_path = output_dir / prompt
-prompt_output_path.mkdir(exist_ok=True, parents=True)
-image.save(prompt_output_path / f'{seed}.png')
-
-```
-
-Running the above script on a V100-32GB GPU generates the following image:
-
- -
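The `token_indices` passed above are the positions, in the tokenized prompt, of the words that Attend-and-Excite should strengthen. If you change the prompt, those positions need to be recomputed. The snippet below is a minimal sketch of one way to list them; it assumes the `pipe` object from the example above is already loaded and exposes its CLIP-style `tokenizer` attribute with the usual `convert_ids_to_tokens` method.

```python
# Minimal sketch (assumes `pipe` from the example above is loaded and exposes
# a CLIP-style `tokenizer`): print each token with its index so that suitable
# `token_indices` can be chosen for Attend-and-Excite.
prompt = "A playful kitten chasing a butterfly in a wildflower meadow"

input_ids = pipe.tokenizer(prompt)["input_ids"]
tokens = pipe.tokenizer.convert_ids_to_tokens(input_ids)

# Index 0 is the begin-of-text token; word tokens start at index 1.
for index, token in enumerate(tokens):
    print(index, token)
```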
diff --git a/ppdiffusers/examples/reproduce/README_cn.md b/ppdiffusers/examples/reproduce/README_cn.md deleted file mode 100644 index c59504e7c7b9..000000000000 --- a/ppdiffusers/examples/reproduce/README_cn.md +++ /dev/null @@ -1,45 +0,0 @@ -# Stable Diffusion - -## 使用 `StableDiffusionAttendAndExcitePipeline` 和 `PNDMScheduler` - -给定一个预先训练好的文本到图像的扩散模型(例如Stable Diffusion),方法`Attend-and-Excite`能引导生成模型在图像生成过程中修改交叉注意力的数值,使得生成图片更忠实于输入文本的提示。 - -使用该pipeline的示例代码如下 - -```python - -from pathlib import Path -import paddle -from ppdiffusers import StableDiffusionAttendAndExcitePipeline, PNDMScheduler - - -scheduler = PNDMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler") -pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - scheduler=scheduler, - ) - -seed = 123 -prompt = "A playful kitten chasing a butterfly in a wildflower meadow" -token_indices = [3,6,10] - -generator = paddle.Generator().manual_seed(seed) -image = pipe( - prompt=prompt, - token_indices=token_indices, - generator=generator, -).images[0] - -# save -output_dir = Path("output_pd") -prompt_output_path = output_dir / prompt -prompt_output_path.mkdir(exist_ok=True, parents=True) -image.save(prompt_output_path / f'{seed}.png') - -``` - -在V100-32GB显卡运行上述代码生成结果如下: - -
- -
diff --git a/ppdiffusers/examples/reproduce/align_record.md b/ppdiffusers/examples/reproduce/align_record.md deleted file mode 100644 index e179d50d7869..000000000000 --- a/ppdiffusers/examples/reproduce/align_record.md +++ /dev/null @@ -1,69 +0,0 @@ - - -## 示例prompt - -以[Hugging Face Spaces](https://huggingface.co/spaces/AttendAndExcite/Attend-and-Excite)选用的prompt和seed为例展示对齐效果如下表:`image_error`一栏分别展示了ppdiffusers和diffusers生成的图片及其绝对误差和相对误差在像素点的分布;`latents_error`一栏展示了模型在去噪过程中每一步生成的潜变量的最大绝对误差(蓝色线条)和最大相对误差(橙色线条)。 - -| prompt | seed | image_error | latents_error | -| :----------------------------------------------------------- | :--: | ------------------------------------------------------------ | ------------------------------------------------------------ | -| A grizzly bear catching a salmon in a crystal clear river surrounded by a forest | 123 | ![123_error_map](https://user-images.githubusercontent.com/40912707/226089730-215319e0-e9f5-4b7a-b51d-593b82a2e0a2.png) | ![123](https://user-images.githubusercontent.com/40912707/226089728-52b2f185-8cf1-4f39-96a2-9d9bae1e2baf.png) | -| A horse and a dog | 123 | ![123_error_map](https://user-images.githubusercontent.com/40912707/226089753-fa78e6d8-c241-4436-be1b-e63ac8318199.png) | !![123](https://user-images.githubusercontent.com/40912707/226089754-18c1b9ce-0287-4f04-8b85-4dbcdb0871c8.png) | -| A mouse and a red car | 2098 | ![2098_error_map](https://user-images.githubusercontent.com/40912707/226089872-9a6fa7a7-f8e5-45fd-acd2-a9ae5e7df6af.png) | ![2098](https://user-images.githubusercontent.com/40912707/226089874-7004bb01-5e19-4c5a-bd55-af741d9e1e38.png) | -| A painting of an elephant with glasses | 123 | ![123_error_map](https://user-images.githubusercontent.com/40912707/226089922-3d833b5c-a522-47b2-a9ed-5d95f2e4b411.png) | ![123](https://user-images.githubusercontent.com/40912707/226089923-35513880-6de1-4da9-bd57-4c747471c359.png) | -| A playful kitten chasing a butterfly in a wildflower meadow | 123 | ![123_error_map](https://user-images.githubusercontent.com/40912707/226089933-5fd1408a-ea13-4329-8f86-391fad46cce8.png) | ![123](https://user-images.githubusercontent.com/40912707/226089934-ce392b7a-e672-46d2-a5e3-d66d23dc66ed.png) | -| A pod of dolphins leaping out of the water in an ocean with a ship on the background | 123 | ![123_error_map](https://user-images.githubusercontent.com/40912707/226089943-b5def711-deb0-4cb2-9589-79693fd46a4f.png) | ![123](https://user-images.githubusercontent.com/40912707/226089945-3e7c37a4-17d5-4d26-99ee-40812957dea3.png) | - - - - - -## 更多的prompt - - -按照论文的prompt构造方法尝试更多的prompt和seed对比ppdiffuser和diffusers的表现。 - -三种prompt模板分别如下,加粗的token表示使用注意力的token: - -* a **{animal}** and a **{animal}** -* a **{animal}** and a {color} **{object}** -* a {colorA} **{objectA}** and a {colorB} **{objectB}** - -其中用于填充的单词如下表 - -| category | words | -| -------- | ------------------------------------------------------------ | -| animals | cat, dog, bird, bear, lion, horse, elephant, monkey, frog, turtle, rabbit, mouse | -| objects | backpack, glasses, crown, suitcase, chair, balloon, bow, car, bowl, bench, clock, apple | -| colors | red, orange, yellow, green, blue, purple, pink, brown, gray, black, white | - -对每一个prompt再随机生成5个seed,对比结果绘制散点图如下,其中每个点代表该prompt和seed的设置下在该step上的误差,颜色越亮则表示散点越聚集: - -| 绝对误差 | 相对误差 | -| -------------------------- | -------------------------- | -| ![align_record_atol](https://user-images.githubusercontent.com/40912707/226089978-a760e900-9309-4058-9075-b5853fcda549.png) | 
![align_record_rtol](https://user-images.githubusercontent.com/40912707/226089977-93d9e502-73ce-4b53-ab3d-066f603ce8d6.png) | - -在测试的105组prompt和seed中,有13个prompt和seed的组合生成的图片有较大差别,记录如下: - -| prompt | seed | image_error | latents_error | -| :------------------------------------- | ---- | ------------------------------------------------------------ | ----------------------------------------------------------- | -| a dog and a pink crown | 1930 | ![1930_error_map](https://user-images.githubusercontent.com/40912707/226090081-1448b7af-db20-4650-b181-44d02421a8fd.png) | ![1930](https://user-images.githubusercontent.com/40912707/226090079-52dfc07d-db0b-46b6-a870-dee4304a19de.png) | -| a mouse and a yellow apple | 28 | ![28_error_map](https://user-images.githubusercontent.com/40912707/226090224-ccda6bfc-d0c2-4332-baac-be4ca92f09ae.png) | ![28](https://user-images.githubusercontent.com/40912707/226090236-3d362322-c6c3-4d9c-9769-60bdf554c030.png) | -| a dog and a pink crown | 1285 | ![1285_error_map](https://user-images.githubusercontent.com/40912707/226090083-1a654c9f-8c79-45c6-ac4c-88065a9526a0.png) | ![1285](https://user-images.githubusercontent.com/40912707/226090082-134c5aa3-7c8d-4481-acde-f4303db3f8d5.png) | -| a mouse and a yellow apple | 60 | ![60_error_map](https://user-images.githubusercontent.com/40912707/226090232-9b5e09d8-139b-4b75-9bd8-463443cb57e2.png) |![60](https://user-images.githubusercontent.com/40912707/226090231-dcd6aadb-185d-49c8-925f-ef4c491a4197.png) | -| a mouse and a yellow apple | 49 | ![49_error_map](https://user-images.githubusercontent.com/40912707/226090227-0d760233-c9d4-4a54-88c5-cb044ae15571.png) | ![49](https://user-images.githubusercontent.com/40912707/226090226-f31ca86d-5947-471c-904c-173d5fd515ad.png) | -| a white chair and a green chair | 933 | ![933_error_map](https://user-images.githubusercontent.com/40912707/226090329-c585ffcf-e17a-4cd1-94e1-ac91f3354f0b.png) | ![933](https://user-images.githubusercontent.com/40912707/226090328-6fab40a9-d9e8-457d-8d99-d37bf860f215.png) | -| a dog and a black bow | 148 | ![148_error_map](https://user-images.githubusercontent.com/40912707/226090404-344e8004-ee42-454a-9b08-de19bfc85ad8.png) | ![148](https://user-images.githubusercontent.com/40912707/226090406-d03b9529-7667-4520-b6be-fbbd558230ae.png) | -| A horse and a dog | 27 | ![27_error_map](https://user-images.githubusercontent.com/40912707/226090430-0827b75c-19b6-4357-b475-28edf4b8cadc.png) | ![27](https://user-images.githubusercontent.com/40912707/226090432-4523dc64-9ce7-42ef-bdf2-8e192cebb212.png)| -| a horse and a gray bow | 1753 | !![1743_error_map](https://user-images.githubusercontent.com/40912707/226090077-637bcfff-cbfa-44c9-a5c9-6a482526a734.png) | ![1743](https://user-images.githubusercontent.com/40912707/226090074-e84b8ab2-27db-444a-9f73-142e91e3a286.png) | -| A painting of an elephant with glasses | 1860 | ![1860_error_map](https://user-images.githubusercontent.com/40912707/226090464-2554b7ca-c40b-4b8b-b746-1529050e91c7.png) |![1860](https://user-images.githubusercontent.com/40912707/226090462-c67f7210-9026-49bc-bf6d-b70dc626be46.png)| -| a horse and a gray bow | 1597 | ![1597_error_map](https://user-images.githubusercontent.com/40912707/226090499-c7ae25d1-59b7-4bfb-b318-048e6c180a92.png) |![1597](https://user-images.githubusercontent.com/40912707/226090498-f2fc3617-843f-4968-8369-3db92b2ad8b5.png) | -| a dog and a pink crown | 1743 | ![1743_error_map](https://user-images.githubusercontent.com/40912707/226090529-b56f5e0c-0808-4dfc-b7be-d7f0dfa01696.png)| 
![1743](https://user-images.githubusercontent.com/40912707/226090528-5425e457-11a6-4985-b17b-237eae93c138.png) | -| a white chair and a green chair | 890 |![890_error_map](https://user-images.githubusercontent.com/40912707/226090327-929f60ca-5eb3-4019-a19b-db1ad8a307d2.png) | ![890](https://user-images.githubusercontent.com/40912707/226090332-d8b281ed-2a2b-4964-8c07-e0120990b344.png) | - - - -#### 由于存在梯度,同样的prompt和seed设置下多次运行依旧存在误差 - -| diffusers | ppdiffusers | -| ------------------------------------------------------------ | ------------------------------------------------------------ | -| ![image-20230318002849117](https://user-images.githubusercontent.com/40912707/226090011-7d4fb85b-229f-4d62-b6bf-b7166c9341ef.png) | ![image-20230318002900522](https://user-images.githubusercontent.com/40912707/226090012-1ffe4095-fa18-46fb-8b11-f9b8369c79a6.png) | diff --git a/ppdiffusers/examples/stable_diffusion/bf16.sh b/ppdiffusers/examples/stable_diffusion/bf16.sh deleted file mode 100644 index 11d273595b34..000000000000 --- a/ppdiffusers/examples/stable_diffusion/bf16.sh +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# unset PADDLE_ELASTIC_JOB_ID -# unset PADDLE_TRAINER_ENDPOINTS -# unset DISTRIBUTED_TRAINER_ENDPOINTS -# unset FLAGS_START_PORT -# unset PADDLE_ELASTIC_TIMEOUT - -export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH - -export FLAGS_conv_workspace_size_limit=4096 -export FLAG_USE_EMA=0 -export FLAG_BENCHMARK=1 -export FLAG_RECOMPUTE=1 -export FLAG_XFORMERS=1 - -export OUTPUT_DIR="bf16_paddle" -export BATCH_SIZE=64 -export MAX_ITER=200000 - -nohup python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" train_txt2img_laion400m_trainer.py \ - --do_train \ - --output_dir ${OUTPUT_DIR} \ - --per_device_train_batch_size ${BATCH_SIZE} \ - --gradient_accumulation_steps 1 \ - --learning_rate 1e-4 \ - --weight_decay 0.01 \ - --max_steps ${MAX_ITER} \ - --lr_scheduler_type "constant" \ - --warmup_steps 0 \ - --image_logging_steps 1000 \ - --logging_steps 10 \ - --resolution 256 \ - --save_steps 10000 \ - --save_total_limit 20 \ - --seed 23 \ - --dataloader_num_workers 8 \ - --pretrained_model_name_or_path ./CompVis-stable-diffusion-v1-4-paddle-init \ - --file_list ./data/filelist/train.filelist.list \ - --model_max_length 77 \ - --max_grad_norm -1 \ - --disable_tqdm True \ - --bf16 True \ - --overwrite_output_dir > paddle_sd_bf16_2048.log 2>&1 & diff --git a/ppdiffusers/examples/stable_diffusion/data/filelist/laion400m_en.filelist b/ppdiffusers/examples/stable_diffusion/data/filelist/laion400m_en.filelist deleted file mode 100644 index a70eccdedbad..000000000000 --- a/ppdiffusers/examples/stable_diffusion/data/filelist/laion400m_en.filelist +++ /dev/null @@ -1,10 +0,0 @@ -/data/laion400m/part-00000.gz -/data/laion400m/part-00001.gz -/data/laion400m/part-00002.gz -/data/laion400m/part-00003.gz -/data/laion400m/part-00004.gz -/data/laion400m/part-00005.gz -/data/laion400m/part-00006.gz 
-/data/laion400m/part-00007.gz -/data/laion400m/part-00008.gz -/data/laion400m/part-00009.gz \ No newline at end of file diff --git a/ppdiffusers/examples/stable_diffusion/data/filelist/laion_aes.filelist b/ppdiffusers/examples/stable_diffusion/data/filelist/laion_aes.filelist deleted file mode 100644 index 86b0e5191d63..000000000000 --- a/ppdiffusers/examples/stable_diffusion/data/filelist/laion_aes.filelist +++ /dev/null @@ -1,50 +0,0 @@ -/root/laion_aes/part-00000 -/root/laion_aes/part-00001 -/root/laion_aes/part-00002 -/root/laion_aes/part-00003 -/root/laion_aes/part-00004 -/root/laion_aes/part-00005 -/root/laion_aes/part-00006 -/root/laion_aes/part-00007 -/root/laion_aes/part-00008 -/root/laion_aes/part-00009 -/root/laion_aes/part-00010 -/root/laion_aes/part-00011 -/root/laion_aes/part-00012 -/root/laion_aes/part-00013 -/root/laion_aes/part-00014 -/root/laion_aes/part-00015 -/root/laion_aes/part-00016 -/root/laion_aes/part-00017 -/root/laion_aes/part-00018 -/root/laion_aes/part-00019 -/root/laion_aes/part-00020 -/root/laion_aes/part-00021 -/root/laion_aes/part-00022 -/root/laion_aes/part-00023 -/root/laion_aes/part-00024 -/root/laion_aes/part-00025 -/root/laion_aes/part-00026 -/root/laion_aes/part-00027 -/root/laion_aes/part-00028 -/root/laion_aes/part-00029 -/root/laion_aes/part-00030 -/root/laion_aes/part-00031 -/root/laion_aes/part-00032 -/root/laion_aes/part-00033 -/root/laion_aes/part-00034 -/root/laion_aes/part-00035 -/root/laion_aes/part-00036 -/root/laion_aes/part-00037 -/root/laion_aes/part-00038 -/root/laion_aes/part-00039 -/root/laion_aes/part-00040 -/root/laion_aes/part-00041 -/root/laion_aes/part-00042 -/root/laion_aes/part-00043 -/root/laion_aes/part-00044 -/root/laion_aes/part-00045 -/root/laion_aes/part-00046 -/root/laion_aes/part-00047 -/root/laion_aes/part-00048 -/root/laion_aes/part-00049 \ No newline at end of file diff --git a/ppdiffusers/examples/stable_diffusion/data/filelist/laion_aes.filelist.list b/ppdiffusers/examples/stable_diffusion/data/filelist/laion_aes.filelist.list deleted file mode 100644 index 0e36e494e2a3..000000000000 --- a/ppdiffusers/examples/stable_diffusion/data/filelist/laion_aes.filelist.list +++ /dev/null @@ -1 +0,0 @@ -./data/filelist/laion_aes.filelist diff --git a/ppdiffusers/examples/stable_diffusion/data/filelist/train.filelist.list b/ppdiffusers/examples/stable_diffusion/data/filelist/train.filelist.list deleted file mode 100644 index 4bc020729904..000000000000 --- a/ppdiffusers/examples/stable_diffusion/data/filelist/train.filelist.list +++ /dev/null @@ -1 +0,0 @@ -./data/filelist/laion400m_en.filelist diff --git a/ppdiffusers/examples/stable_diffusion/data/filelist/write_filelist.py b/ppdiffusers/examples/stable_diffusion/data/filelist/write_filelist.py deleted file mode 100644 index 358bca25f4fd..000000000000 --- a/ppdiffusers/examples/stable_diffusion/data/filelist/write_filelist.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -data = [] -for index in range(60000): - data.append("/data/laion400m/part-{:05}.gz\n".format(index)) - -with open("laion400m_en.filelist", "w") as w: - w.writelines(data) diff --git a/ppdiffusers/examples/stable_diffusion/prepare.sh b/ppdiffusers/examples/stable_diffusion/prepare.sh deleted file mode 100644 index 9e5139d69a87..000000000000 --- a/ppdiffusers/examples/stable_diffusion/prepare.sh +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -rm -rf CompVis-stable-diffusion-v1-4-paddle-init-pd.tar.gz -rm -rf CompVis-stable-diffusion-v1-4-paddle-init - -wget https://bj.bcebos.com/paddlenlp/models/community/CompVis/CompVis-stable-diffusion-v1-4-paddle-init-pd.tar.gz -tar -zxvf CompVis-stable-diffusion-v1-4-paddle-init-pd.tar.gz - -# pip install -r requirements.txt \ No newline at end of file diff --git a/ppdiffusers/examples/stable_diffusion/requirements.txt b/ppdiffusers/examples/stable_diffusion/requirements.txt deleted file mode 100644 index a9e3f0651f2d..000000000000 --- a/ppdiffusers/examples/stable_diffusion/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -paddlenlp>=2.6.0rc0.post0 -ppdiffusers>=0.17.1 -fastcore -visualdl -Pillow -safetensors \ No newline at end of file diff --git a/ppdiffusers/examples/stable_diffusion/sd/__init__.py b/ppdiffusers/examples/stable_diffusion/sd/__init__.py deleted file mode 100644 index 38449bdc210a..000000000000 --- a/ppdiffusers/examples/stable_diffusion/sd/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# flake8: noqa - -from .model import StableDiffusionModel -from .sd_args import SDDataArguments, SDModelArguments, SDTrainingArguments -from .sd_trainer import StableDiffusionTrainer -from .text_image_pair_dataset import TextImagePair, worker_init_fn diff --git a/ppdiffusers/examples/stable_diffusion/sd/model.py b/ppdiffusers/examples/stable_diffusion/sd/model.py deleted file mode 100644 index 685b2fa21619..000000000000 --- a/ppdiffusers/examples/stable_diffusion/sd/model.py +++ /dev/null @@ -1,355 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import contextlib -import inspect -import os - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from paddlenlp.transformers import AutoTokenizer, CLIPTextModel -from paddlenlp.utils.log import logger -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - DDPMScheduler, - UNet2DConditionModel, - is_ppxformers_available, -) -from ppdiffusers.initializer import reset_initialized_parameter, zeros_ -from ppdiffusers.models.attention import AttentionBlock -from ppdiffusers.models.ema import LitEma -from ppdiffusers.models.resnet import ResnetBlock2D -from ppdiffusers.models.transformer_2d import Transformer2DModel -from ppdiffusers.training_utils import freeze_params - - -class StableDiffusionModel(nn.Layer): - def __init__(self, model_args): - super().__init__() - self.model_args = model_args - tokenizer_name_or_path = ( - model_args.tokenizer_name - if model_args.tokenizer_name is not None - else os.path.join(model_args.pretrained_model_name_or_path, "tokenizer") - ) - vae_name_or_path = ( - model_args.vae_name_or_path - if model_args.vae_name_or_path is not None - else os.path.join(model_args.pretrained_model_name_or_path, "vae") - ) - text_encoder_name_or_path = ( - model_args.text_encoder_name_or_path - if model_args.text_encoder_name_or_path is not None - else os.path.join(model_args.pretrained_model_name_or_path, "text_encoder") - ) - unet_name_or_path = ( - model_args.unet_name_or_path - if model_args.unet_name_or_path is not None - else os.path.join(model_args.pretrained_model_name_or_path, "unet") - ) - # init model and tokenizer - tokenizer_kwargs = {} - if model_args.model_max_length is not None: - tokenizer_kwargs["model_max_length"] = model_args.model_max_length - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, **tokenizer_kwargs) - self.vae = AutoencoderKL.from_pretrained(vae_name_or_path) - self.text_encoder = CLIPTextModel.from_pretrained(text_encoder_name_or_path) - try: - self.unet = UNet2DConditionModel.from_pretrained(unet_name_or_path) - except Exception: - self.unet = UNet2DConditionModel.from_config(unet_name_or_path) - self.init_unet_weights() - logger.info("Init unet model from scratch!") - - freeze_params(self.vae.parameters()) - logger.info("Freeze vae parameters!") - if not self.model_args.train_text_encoder: - freeze_params(self.text_encoder.parameters()) - logger.info("Freeze text_encoder parameters!") - self.text_encoder.eval() - self.train_text_encoder = False - else: - self.text_encoder.train() - self.train_text_encoder = True - self.unet.train() - self.vae.eval() - - # init noise_scheduler and eval_scheduler - assert self.model_args.prediction_type in ["epsilon", "v_prediction"] - self.noise_scheduler = DDPMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - num_train_timesteps=1000, - prediction_type=self.model_args.prediction_type, - ) - self.register_buffer("alphas_cumprod", self.noise_scheduler.alphas_cumprod) - self.eval_scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - num_train_timesteps=1000, - 
clip_sample=False, - set_alpha_to_one=False, - steps_offset=1, - prediction_type=self.model_args.prediction_type, - ) - self.eval_scheduler.set_timesteps(self.model_args.num_inference_steps) - self.use_ema = False - self.model_ema = None - - def compute_snr(self, timesteps): - """ - Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849 - """ - sqrt_alphas_cumprod = self.alphas_cumprod**0.5 - sqrt_one_minus_alphas_cumprod = (1.0 - self.alphas_cumprod) ** 0.5 - - # Expand the tensors. - # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026 - sqrt_alphas_cumprod = sqrt_alphas_cumprod[timesteps].cast("float32") - while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape): - sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None] - alpha = sqrt_alphas_cumprod.expand(timesteps.shape) - - sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[timesteps].cast("float32") - while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape): - sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None] - sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape) - - # Compute SNR. - snr = (alpha / sigma) ** 2 - return snr - - def forward(self, input_ids=None, pixel_values=None, **kwargs): - self.vae.eval() - if not self.model_args.train_text_encoder: - self.text_encoder.eval() - - # vae encode - latents = self.vae.encode(pixel_values).latent_dist.sample() - latents = latents * self.vae.config.scaling_factor - - # Sample noise that we'll add to the latents - noise = paddle.randn(latents.shape) - if self.model_args.noise_offset: - # https://www.crosslabs.org//blog/diffusion-with-offset-noise - noise += self.model_args.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), dtype=noise.dtype - ) - if self.model_args.input_perturbation: - new_noise = noise + self.model_args.input_perturbation * paddle.randn(noise.shape, dtype=noise.dtype) - - timesteps = paddle.randint(0, self.noise_scheduler.config.num_train_timesteps, (latents.shape[0],)).cast( - "int64" - ) - # Add noise to the latents according to the noise magnitude at each timestep - # (this is the forward diffusion process) - if self.model_args.input_perturbation: - noisy_latents = self.add_noise(latents, new_noise, timesteps) - else: - noisy_latents = self.add_noise(latents, noise, timesteps) - - # text encode - encoder_hidden_states = self.text_encoder(input_ids)[0] - - # unet - model_pred = self.unet( - sample=noisy_latents, timestep=timesteps, encoder_hidden_states=encoder_hidden_states - ).sample - - # Get the target for loss depending on the prediction type - if self.model_args.prediction_type == "epsilon": - target = noise - elif self.model_args.prediction_type == "v_prediction": - target = self.get_velocity(latents, noise, timesteps) - else: - raise ValueError(f"Unknown prediction type {self.model_args.prediction_type}") - - # compute loss - if self.model_args.snr_gamma is None: - loss = ( - F.mse_loss(model_pred.cast("float32"), target.cast("float32"), reduction="none").mean([1, 2, 3]).mean() - ) - else: - # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556. - # Since we predict the noise instead of x_0, the original formulation is slightly changed. - # This is discussed in Section 4.2 of the same paper. 
- snr = self.compute_snr(timesteps) - mse_loss_weights = ( - paddle.stack([snr, self.model_args.snr_gamma * paddle.ones_like(timesteps)], axis=1).min(axis=1)[0] - / snr - ) - # We first calculate the original loss. Then we mean over the non-batch dimensions and - # rebalance the sample-wise losses with their respective loss weights. - # Finally, we take the mean of the rebalanced loss. - loss = F.mse_loss(model_pred.cast("float32"), target.cast("float32"), reduction="none") - loss = loss.mean(list(range(1, len(loss.shape)))) * mse_loss_weights - loss = loss.mean() - return loss - - def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, - ) -> paddle.Tensor: - sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 - sqrt_alpha_prod = sqrt_alpha_prod.flatten() - while len(sqrt_alpha_prod.shape) < len(original_samples.shape): - sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - - sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5 - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - - noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise - return noisy_samples - - def get_velocity(self, sample: paddle.Tensor, noise: paddle.Tensor, timesteps: paddle.Tensor) -> paddle.Tensor: - sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 - sqrt_alpha_prod = sqrt_alpha_prod.flatten() - while len(sqrt_alpha_prod.shape) < len(sample.shape): - sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - - sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5 - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - - velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample - return velocity - - def init_unet_weights(self): - reset_initialized_parameter(self.unet) - zeros_(self.unet.conv_out.weight) - zeros_(self.unet.conv_out.bias) - for _, m in self.unet.named_sublayers(): - if isinstance(m, AttentionBlock): - zeros_(m.proj_attn.weight) - zeros_(m.proj_attn.bias) - if isinstance(m, ResnetBlock2D): - zeros_(m.conv2.weight) - zeros_(m.conv2.bias) - if isinstance(m, Transformer2DModel): - zeros_(m.proj_out.weight) - zeros_(m.proj_out.bias) - - @contextlib.contextmanager - def ema_scope(self, context=None): - if self.use_ema: - self.model_ema.store(self.unet.parameters()) - self.model_ema.copy_to(self.unet) - if context is not None: - print(f"{context}: Switched to EMA weights") - try: - yield None - finally: - if self.use_ema: - self.model_ema.restore(self.unet.parameters()) - if context is not None: - print(f"{context}: Restored training weights") - - def on_train_batch_end(self): - if self.use_ema: - self.model_ema(self.unet) - - @paddle.no_grad() - def decode_image(self, pixel_values=None, max_batch=8, **kwargs): - self.eval() - if pixel_values.shape[0] > max_batch: - pixel_values = pixel_values[:max_batch] - latents = self.vae.encode(pixel_values).latent_dist.sample() - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1).transpose([0, 2, 3, 1]) - image = (image * 255.0).cast("float32").numpy().round() - return image - - @paddle.no_grad() - def log_image(self, input_ids=None, height=256, width=256, eta=0.0, guidance_scale=7.5, max_batch=8, 
**kwargs): - self.eval() - with self.ema_scope(): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - # only log max_batch image - if input_ids.shape[0] > max_batch: - input_ids = input_ids[:max_batch] - text_embeddings = self.text_encoder(input_ids)[0] - do_classifier_free_guidance = guidance_scale > 1.0 - if do_classifier_free_guidance: - batch_size, max_length = input_ids.shape - uncond_input = self.tokenizer( - [""] * batch_size, - padding="max_length", - truncation=True, - max_length=max_length, - return_tensors="pd", - ) - uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] - text_embeddings = paddle.concat([uncond_embeddings, text_embeddings], axis=0) - - latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, height // 8, width // 8)) - latents = latents * self.eval_scheduler.init_noise_sigma - accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - for t in self.eval_scheduler.timesteps: - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t) - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1).transpose([0, 2, 3, 1]) * 255.0 - return image.cast("float32").numpy().round() - - def set_recompute(self, use_recompute=False): - if use_recompute: - self.unet.enable_gradient_checkpointing() - if self.model_args.train_text_encoder and hasattr(self.text_encoder, "gradient_checkpointing_enable"): - self.text_encoder.gradient_checkpointing_enable() - - def gradient_checkpointing_enable(self): - self.set_recompute(True) - - def set_xformers(self, use_xformers=False): - if use_xformers: - if not is_ppxformers_available(): - raise ValueError( - 'Please run `python -m pip install "paddlepaddle-gpu>=2.5.0.post117" -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html first.' - ) - else: - try: - self.unet.enable_xformers_memory_efficient_attention() - if hasattr(self.vae, "enable_xformers_memory_efficient_attention"): - self.vae.enable_xformers_memory_efficient_attention() - if hasattr(self.text_encoder, "enable_xformers_memory_efficient_attention"): - self.text_encoder.enable_xformers_memory_efficient_attention() - except Exception as e: - logger.warn( - "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}" - ) - - def set_ema(self, use_ema=False): - self.use_ema = use_ema - if use_ema: - self.model_ema = LitEma(self.unet) diff --git a/ppdiffusers/examples/stable_diffusion/sd/sd_args.py b/ppdiffusers/examples/stable_diffusion/sd/sd_args.py deleted file mode 100644 index a8a8d8e5ba28..000000000000 --- a/ppdiffusers/examples/stable_diffusion/sd/sd_args.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from dataclasses import dataclass, field -from typing import List, Optional - -from paddlenlp.trainer import TrainingArguments - -__all__ = [ - "SDTrainingArguments", - "SDModelArguments", - "SDDataArguments", -] -import os - - -def str2bool(v): - if v.lower() in ("yes", "true", "t", "y", "1"): - return True - elif v.lower() in ("no", "false", "f", "n", "0"): - return False - else: - raise ValueError("Unsupported value encountered.") - - -@dataclass -class SDTrainingArguments(TrainingArguments): - image_logging_steps: int = field(default=1000, metadata={"help": "Log image every X steps."}) - to_static: bool = field(default=False, metadata={"help": "Whether or not to_static"}) - benchmark: bool = field( - default=False, - metadata={"help": "Whether or not run benchmark."}, - ) - profiler_options: Optional[str] = field( - default=None, - metadata={"help": "profiler_options."}, - ) - report_to: Optional[List[str]] = field( - default_factory=lambda: ["custom_visualdl"], - metadata={"help": "The list of integrations to report the results and logs to."}, - ) - resolution: int = field( - default=512, - metadata={ - "help": "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." 
- }, - ) - use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"}) - enable_xformers_memory_efficient_attention: bool = field( - default=False, metadata={"help": "enable_xformers_memory_efficient_attention."} - ) - only_save_updated_model: bool = field( - default=True, metadata={"help": "Whether or not save only_save_updated_model"} - ) - unet_learning_rate: float = field(default=None, metadata={"help": "The initial learning rate for Unet Model."}) - text_encoder_learning_rate: float = field( - default=None, metadata={"help": "The initial learning rate for Text Encoder Model."} - ) - - def __post_init__(self): - super().__post_init__() - self.image_logging_steps = ( - (math.ceil(self.image_logging_steps / self.logging_steps) * self.logging_steps) - if self.image_logging_steps > 0 - else -1 - ) - self.use_ema = str2bool(os.getenv("FLAG_USE_EMA", "False")) or self.use_ema - self.enable_xformers_memory_efficient_attention = ( - str2bool(os.getenv("FLAG_XFORMERS", "False")) or self.enable_xformers_memory_efficient_attention - ) - self.recompute = str2bool(os.getenv("FLAG_RECOMPUTE", "False")) or self.recompute - self.benchmark = str2bool(os.getenv("FLAG_BENCHMARK", "False")) or self.benchmark - self.to_static = str2bool(os.getenv("FLAG_TO_STATIC", "False")) or self.to_static - - if self.text_encoder_learning_rate is None: - self.text_encoder_learning_rate = self.learning_rate - if self.unet_learning_rate is None: - self.unet_learning_rate = self.learning_rate - - # set default learning rate - self.learning_rate = self.unet_learning_rate - - if self.to_static: - self.use_ema = False - self.enable_xformers_memory_efficient_attention = False - self.recompute = False - - -@dataclass -class SDModelArguments: - vae_name_or_path: Optional[str] = field(default=None, metadata={"help": "vae_name_or_path"}) - text_encoder_name_or_path: Optional[str] = field(default=None, metadata={"help": "text_encoder_name_or_path"}) - unet_name_or_path: Optional[str] = field(default=None, metadata={"help": "unet_name_or_path"}) - tokenizer_name: Optional[str] = field( - default=None, - metadata={"help": "Pretrained tokenizer name or path if not the same as pretrained_model_name_or_path"}, - ) - pretrained_model_name_or_path: str = field( - default="CompVis/stable-diffusion-v1-4", - metadata={"help": "Path to pretrained model or model, when we want to resume training."}, - ) - model_max_length: int = field(default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) - prediction_type: str = field( - default="epsilon", - metadata={ - "help": "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)" - }, - ) - num_inference_steps: int = field(default=50, metadata={"help": "num_inference_steps"}) - train_text_encoder: bool = field(default=False, metadata={"help": "Whether or not train text encoder"}) - - noise_offset: float = field(default=0, metadata={"help": "The scale of noise offset."}) - snr_gamma: Optional[float] = field( - default=None, - metadata={ - "help": "SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. More details here: https://arxiv.org/abs/2303.09556." - }, - ) - input_perturbation: Optional[float] = field( - default=0, metadata={"help": "The scale of input perturbation. 
Recommended 0.1."} - ) - - -@dataclass -class SDDataArguments: - """ - Arguments pertaining to what data we are going to input our model for training. - """ - - file_list: str = field( - default="./data/filelist/train.filelist.list", metadata={"help": "The name of the file_list."} - ) - num_records: int = field(default=10000000, metadata={"help": "num_records"}) - buffer_size: int = field( - default=100, - metadata={"help": "Buffer size"}, - ) - shuffle_every_n_samples: int = field( - default=5, - metadata={"help": "shuffle_every_n_samples."}, - ) - interpolation: str = field( - default="lanczos", - metadata={"help": "interpolation method"}, - ) diff --git a/ppdiffusers/examples/stable_diffusion/sd/sd_trainer.py b/ppdiffusers/examples/stable_diffusion/sd/sd_trainer.py deleted file mode 100644 index c84cf2c5db14..000000000000 --- a/ppdiffusers/examples/stable_diffusion/sd/sd_trainer.py +++ /dev/null @@ -1,254 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import time - -import paddle -from paddle.io import DataLoader - -from paddlenlp.trainer import PrinterCallback, ProgressCallback, Trainer -from paddlenlp.trainer.integrations import ( - INTEGRATION_TO_CALLBACK, - TrainerCallback, - VisualDLCallback, - rewrite_logs, -) -from paddlenlp.transformers.model_utils import _add_variant -from paddlenlp.utils import profiler -from paddlenlp.utils.log import logger -from ppdiffusers.training_utils import unwrap_model - -from .text_image_pair_dataset import TextImagePair, worker_init_fn - -PADDLE_WEIGHTS_NAME = "model_state.pdparams" -TRAINING_ARGS_NAME = "training_args.bin" - - -class VisualDLWithImageCallback(VisualDLCallback): - def on_step_end(self, args, state, control, model=None, **kwargs): - if hasattr(model, "on_train_batch_end"): - model.on_train_batch_end() - if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0: - control.should_log = True - - def on_log(self, args, state, control, logs=None, **kwargs): - # log image on each node - inputs = kwargs.get("inputs", None) - model = kwargs.get("model", None) - image_logs = {} - if ( - inputs is not None - and model is not None - and args.image_logging_steps > 0 - and state.global_step % args.image_logging_steps == 0 - ): - max_batch = 4 if args.resolution > 256 else 8 - image_logs["reconstruction"] = model.decode_image(pixel_values=inputs["pixel_values"], max_batch=max_batch) - image_logs["ddim-samples-1.0"] = model.log_image( - input_ids=inputs["input_ids"], - guidance_scale=1.0, - height=args.resolution, - width=args.resolution, - max_batch=max_batch, - ) - image_logs["ddim-samples-7.5"] = model.log_image( - input_ids=inputs["input_ids"], - guidance_scale=7.5, - height=args.resolution, - width=args.resolution, - max_batch=max_batch, - ) - - if not state.is_world_process_zero: - return - - if self.vdl_writer is None: - self._init_summary_writer(args) - - base_learning_rate = logs.get("learning_rate", None) - if base_learning_rate 
is not None: - logs["unet_lr"] = base_learning_rate - if model.train_text_encoder: - if args.text_encoder_learning_rate != args.unet_learning_rate: - logs["unet_lr"] = base_learning_rate * args.unet_learning_rate - logs["text_encoder_lr"] = base_learning_rate * args.text_encoder_learning_rate - else: - logs["text_encoder_lr"] = base_learning_rate - - if self.vdl_writer is not None: - logs = rewrite_logs(logs) - for k, v in logs.items(): - if isinstance(v, (int, float)): - self.vdl_writer.add_scalar(k, v, state.global_step) - else: - logger.warning( - "Trainer is attempting to log a value of " - f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' - "This invocation of VisualDL's writer.add_scalar() " - "is incorrect so we dropped this attribute." - ) - # log images - for k, v in image_logs.items(): - self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC") - self.vdl_writer.flush() - - -class AverageStatistical(object): - def __init__(self): - self.reset() - - def reset(self): - self.total_cnt = 0 - self.time = 0 - - def record(self, val, cnt=1): - self.time += val - self.total_cnt += cnt - - def get_average(self): - if self.total_cnt == 0: - return 0 - - return self.time / self.total_cnt - - def get_average_per_sec(self): - if self.time == 0.0: - return 0.0 - - return float(self.total_cnt) / self.time - - def get_total_cnt(self): - return self.total_cnt - - def get_total_time(self): - return self.time - - -class BenchmarkCallback(TrainerCallback): - def __init__(self, benchmark=True, profiler_options=None): - self.benchmark = benchmark - self.profiler_options = profiler_options - - def on_train_begin(self, args, state, control, **kwargs): - assert args.gradient_accumulation_steps == 1 and not args.do_eval and not args.do_predict - if self.benchmark: - self.reader_cost_avg = AverageStatistical() - - def on_epoch_begin(self, args, state, control, **kwargs): - if self.benchmark: - self.epoch_start = time.time() - self.batch_start = time.time() - - def on_step_begin(self, args, state, control, **kwargs): - if self.benchmark: - self.reader_cost_avg.record(time.time() - self.batch_start) - - def on_step_end(self, args, state, control, **kwargs): - if self.profiler_options is not None: - profiler.add_profiler_step(self.profiler_options) - - if self.benchmark: - self.batch_start = time.time() - if control.should_log: - self.maybe_log_save_evaluate_start = time.time() - - def on_log(self, args, state, control, logs=None, **kwargs): - if self.benchmark: - if logs is not None and "interval_steps_per_second" in logs: - self.batch_start = self.batch_start + (time.time() - self.maybe_log_save_evaluate_start) - ips = logs["interval_steps_per_second"] * args.train_batch_size - avg_batch_cost = 1 / logs["interval_steps_per_second"] - logger.info( - "global step %d / %d, loss: %f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sample/sec" - % ( - state.global_step, - state.max_steps, - logs["loss"], - self.reader_cost_avg.get_average(), - avg_batch_cost, - args.train_batch_size, - ips, - ) - ) - self.reader_cost_avg.reset() - - def on_epoch_end(self, args, state, control, **kwargs): - if self.benchmark: - train_epoch_cost = time.time() - self.epoch_start - logger.info("train epoch: %d, epoch_cost: %.5f s" % (state.epoch, train_epoch_cost)) - - -# register visualdl_with_image -INTEGRATION_TO_CALLBACK.update({"custom_visualdl": VisualDLWithImageCallback}) - - -class StableDiffusionTrainer(Trainer): - def __init__(self, **kwargs): - super().__init__(**kwargs) 
- if self.args.benchmark or self.args.profiler_options is not None: - self.add_callback( - BenchmarkCallback(benchmark=self.args.benchmark, profiler_options=self.args.profiler_options) - ) - if self.args.benchmark: - if self.args.disable_tqdm: - self.pop_callback(PrinterCallback) - else: - self.pop_callback(ProgressCallback) - - def compute_loss(self, model, inputs, return_outputs=False): - loss = model(**inputs) - return loss - - def get_train_dataloader(self): - if self.train_dataset is None: - raise ValueError("Trainer: training requires a train_dataset.") - if isinstance(self.train_dataset, TextImagePair): - return DataLoader( - self.train_dataset, - batch_size=self.args.train_batch_size, - num_workers=self.args.dataloader_num_workers, - worker_init_fn=worker_init_fn, - ) - else: - return super().get_train_dataloader() - - def _save(self, output_dir=None, state_dict=None, merge_tensor_parallel=False): - output_dir = output_dir if output_dir is not None else self.args.output_dir - os.makedirs(output_dir, exist_ok=True) - if self.args.only_save_updated_model: - unwraped_model = unwrap_model(self.model) - logger.info(f"Saving unet checkpoint to {output_dir}/unet") - unwraped_model.unet.save_pretrained(os.path.join(output_dir, "unet")) - - if unwraped_model.use_ema: - logger.info(f"Saving ema unet checkpoint to {output_dir}/unet") - with unwraped_model.ema_scope(): - unwraped_model.unet.save_pretrained(os.path.join(output_dir, "unet"), variant="ema") - - if unwraped_model.train_text_encoder: - logger.info(f"Saving text encoder checkpoint to {output_dir}/text_encoder") - unwraped_model.text_encoder.save_pretrained(os.path.join(output_dir, "text_encoder")) - else: - logger.info(f"Saving model checkpoint to {output_dir}") - if state_dict is None: - state_dict = self.model.state_dict() - paddle.save( - state_dict, - os.path.join(output_dir, _add_variant(PADDLE_WEIGHTS_NAME, self.args.weight_name_suffix)), - ) - if self.args.should_save: - if self.tokenizer is not None: - self.tokenizer.save_pretrained(output_dir) - paddle.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) diff --git a/ppdiffusers/examples/stable_diffusion/sd/text_image_pair_dataset.py b/ppdiffusers/examples/stable_diffusion/sd/text_image_pair_dataset.py deleted file mode 100644 index b41f0b799469..000000000000 --- a/ppdiffusers/examples/stable_diffusion/sd/text_image_pair_dataset.py +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
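A note on reloading the checkpoints written by the `_save` method above: with `only_save_updated_model` enabled it stores the sub-models under `unet/` (plus an `ema` variant when EMA is on) and `text_encoder/`, so they can be plugged back into a pipeline for inference. The sketch below assumes that layout; the checkpoint path is hypothetical and the base model is the default `CompVis/stable-diffusion-v1-4` from the model arguments above.

```python
# Minimal sketch: rebuild an inference pipeline from a checkpoint written by the trainer above.
# The checkpoint directory name is hypothetical; point it at your own output_dir/checkpoint-XXXX.
from ppdiffusers import StableDiffusionPipeline, UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained("./output_dir/checkpoint-5000/unet")
pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", unet=unet, safety_checker=None
)
image = pipe("a photo of an astronaut riding a horse").images[0]
image.save("sample.png")
```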
- -import base64 -import gzip -import io -import json -import random - -import numpy as np -import paddle -import paddle.distributed as dist -from paddle.io import IterableDataset, get_worker_info -from paddle.vision import transforms -from paddle.vision.transforms.transforms import _get_image_size -from PIL import Image - -Image.MAX_IMAGE_PIXELS = 2300000000 - - -def parse_line(line, filename): - def parse_src(filename): - if "laion_aes" in filename: - return "laion_aes" - elif "laion400m" in filename: - return "laion400m" - else: - raise NotImplementedError(f"Unkown data source, {filename}") - - try: - vec = line.strip().split("\t") - data_source = parse_src(filename) - if data_source == "laion400m": - caption, _, img_b64 = vec[:3] - elif data_source == "laion_aes": - text_json = json.loads(vec[2]) - img_b64 = vec[5] - caption = text_json.get("caption_en", text_json.get("blip_caption_en", "")) - else: - _, captions, _, _, _, img_b64 = vec[:6] - caption = random.sample(captions.split("|"), 1)[0].replace("\1", "") - - image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB") - if random.random() < 0.1: - caption = "" - return dict(image=image, caption=caption) - except Exception: - print(f"error when parse file {filename}") - # traceback.print_exc() - return None - - -# donot use random.randint -class RandomCrop(transforms.RandomCrop): - def _get_param(self, img, output_size): - w, h = _get_image_size(img) - th, tw = output_size - if w == tw and h == th: - return 0, 0, h, w - - i = paddle.randint(0, h - th + 1).item() - j = paddle.randint(0, w - tw + 1).item() - return i, j, th, tw - - -class TextImagePair(IterableDataset): - def __init__( - self, - file_list, - size, - num_records, - image_processing=None, - buffer_size=1000, - shuffle_every_n_samples=5, - interpolation="lanczos", - tokenizer=None, - ): - self.size = size - if image_processing is None: - self.image_processing = transforms.Compose( - [ - transforms.Resize(int(size / 0.9), interpolation), - RandomCrop(size), - transforms.ToTensor(), - transforms.Normalize(0.5, 0.5), - ] - ) - else: - self.image_processing = image_processing - self.text_processing = lambda caption: tokenizer( - caption, - padding="max_length", - truncation=True, - max_length=tokenizer.model_max_length, - return_tensors="pd", - ).input_ids[0] - self.file_list = [] - file_weights = [] - with open(file_list, "r") as f: - file_lists = f.read().strip().split("\n") - for file_l in file_lists: - file_l = file_l.split(" ") - if len(file_l) > 1: - file_weight = float(file_l[1]) - file_weights.append(file_weight) - file_l = file_l[0] - with open(file_l, "r") as f: - self.file_list.append(f.read().strip().split("\n")) - print([len(file_l) for file_l in self.file_list]) - if len(file_weights) == len(self.file_list): - file_weights = np.array(file_weights) - file_weight_sum = np.sum(file_weights) - assert file_weight_sum > 0, "sum of file weights must > 0" - file_weights = file_weights / file_weight_sum - print(f"sample weights of files: {file_weights}") - self.file_weights_cumsum = np.cumsum(file_weights) - self.file_weights_cumsum = np.concatenate([[0.0], self.file_weights_cumsum]) - else: - print("sample each file list with same probabiliy") - self.file_weights_cumsum = None - - self.num_records = num_records - self.file_ids = [np.arange(len(filelist)) for filelist in self.file_list] - print(f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}") - self.buffer_size = buffer_size - self.shuffle_every_n_samples = shuffle_every_n_samples - 
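For clarity, here is a sketch (with made-up paths) of the nested file-list layout that the `__init__` above parses: `file_list` points to a text file whose lines are `<filelist path> [sampling weight]`, and each of those filelist files in turn lists the actual data shards consumed by `sample_loader` below. The weights, when present, are normalized and turned into the cumulative distribution stored in `file_weights_cumsum`.

```
# train.filelist.list -- hypothetical paths; each line is "<filelist> [sampling weight]"
./data/filelist/laion400m_en.filelist 0.8
./data/filelist/laion_aes.filelist 0.2

# laion400m_en.filelist -- one data shard per line
./data/laion400m/part-00000.gz
./data/laion400m/part-00001.gz
```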
- def sample_loader(self, file_ids, filenames): - while True: - random.shuffle(file_ids) - for i in file_ids: - filename = filenames[i].strip("\n") - with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f: - # retry = 0 - while True: - line = f.readline() - - if line == b"": - break - try: - try: - line = line.decode(encoding="utf-8") - except Exception: - line = line.decode(encoding="gb18030") - except Exception: - print(f"error on file {filename}") - continue - data = parse_line(line, filename) - if data is None: - # retry += 1 - # if retry > 100: - # break - continue - else: - w, h = data["image"].size - if w < self.size or h < self.size: - continue - yield { - "pixel_values": self.image_processing(data["image"]), - "input_ids": self.text_processing(data["caption"]), - } - - def random_load_from_multi_dataset(self): - print(f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}") - sample_loader_per_dataset = [ - iter(self.sample_loader(self.file_ids[i], self.file_list[i])) for i in range(len(self.file_ids)) - ] - - while True: - if self.file_weights_cumsum is None: - sample_loader = random.choice(sample_loader_per_dataset) - else: - rand_num = random.random() - for i in range(len(self.file_list)): - if self.file_weights_cumsum[i] <= rand_num < self.file_weights_cumsum[i + 1]: - break - sample_loader = sample_loader_per_dataset[i] - # debug - # print(self.file_list[i][0]) - yield next(sample_loader) - - def shuffle(self, iterator): - buffer_list = [] - for _ in range(self.buffer_size): - buffer_list.append(next(iterator)) - i = 0 - while True: - if i % self.shuffle_every_n_samples == 0: - random.shuffle(buffer_list) - yield buffer_list.pop() - buffer_list.append(next(iterator)) - i += 1 - - def __len__(self): - return self.num_records - - def __iter__(self): - return self.shuffle(iter(self.random_load_from_multi_dataset())) - - -def worker_init_fn(_): - worker_info = get_worker_info() - dataset = worker_info.dataset - worker_id = worker_info.id - - local_rank = dist.get_rank() - world_size = dist.get_world_size() - num_workers = worker_info.num_workers - worker_id = worker_info.id - worker_global_id = local_rank * num_workers + worker_id - - dataset.rng = np.random.RandomState(worker_global_id) - for i in range(len(dataset.file_ids)): - - file_ids = dataset.file_ids[i] - num_chunks = world_size * num_workers - chunk_size = len(file_ids) // num_chunks - - begin_id = worker_global_id * chunk_size - end_id = (worker_global_id + 1) * chunk_size - dataset.file_ids[i] = dataset.file_ids[i][begin_id:end_id] - print( - f"dataset {i}, local_rank: {local_rank}, worker_id: {worker_id}, worker_global_id: {worker_global_id}, file_range: ({begin_id}, {end_id})" - ) - return np.random.seed(np.random.get_state()[1][0] + worker_id) diff --git a/ppdiffusers/examples/stable_diffusion/train_txt2img_laion400m_trainer.py b/ppdiffusers/examples/stable_diffusion/train_txt2img_laion400m_trainer.py deleted file mode 100644 index 165a20ca8547..000000000000 --- a/ppdiffusers/examples/stable_diffusion/train_txt2img_laion400m_trainer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import itertools -import os - -import paddle -from sd import ( - SDDataArguments, - SDModelArguments, - SDTrainingArguments, - StableDiffusionModel, - StableDiffusionTrainer, - TextImagePair, -) - -from paddlenlp.trainer import PdArgumentParser, get_last_checkpoint, set_seed -from paddlenlp.utils.log import logger - - -def main(): - parser = PdArgumentParser((SDModelArguments, SDDataArguments, SDTrainingArguments)) - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - training_args.print_config(model_args, "Model") - training_args.print_config(data_args, "Data") - - paddle.set_device(training_args.device) - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
- )
- if training_args.seed is not None:
- set_seed(training_args.seed)
-
- model = StableDiffusionModel(model_args)
- model.set_recompute(training_args.recompute)
- model.set_xformers(training_args.enable_xformers_memory_efficient_attention)
- model.set_ema(training_args.use_ema)
-
- if training_args.to_static:
- input_ids = paddle.static.InputSpec(name="input_ids", shape=[-1, model_args.model_max_length], dtype="int64")
- pixel_values = paddle.static.InputSpec(
- name="pixel_values", shape=[-1, 3, training_args.resolution, training_args.resolution], dtype="float32"
- )
- specs = [input_ids, pixel_values]
- paddle.jit.ignore_module([os])
- model = paddle.jit.to_static(model, input_spec=specs)
- logger.info("Successfully applied @to_static with specs: {}".format(specs))
-
- train_dataset = TextImagePair(
- file_list=data_args.file_list,
- size=training_args.resolution,
- num_records=data_args.num_records,
- buffer_size=data_args.buffer_size,
- shuffle_every_n_samples=data_args.shuffle_every_n_samples,
- interpolation=data_args.interpolation,
- tokenizer=model.tokenizer,
- )
-
- trainer = StableDiffusionTrainer(
- model=model,
- args=training_args,
- train_dataset=train_dataset,
- tokenizer=model.tokenizer,
- )
-
- if model_args.train_text_encoder:
- if training_args.text_encoder_learning_rate == training_args.unet_learning_rate:
- params_to_train = itertools.chain(model.text_encoder.parameters(), model.unet.parameters())
- else:
- # overwrite default learning rate with 1.0
- training_args.learning_rate = 1.0
- params_to_train = [
- {
- "params": model.text_encoder.parameters(),
- "learning_rate": training_args.text_encoder_learning_rate,
- },
- {
- "params": model.unet.parameters(),
- "learning_rate": training_args.unet_learning_rate,
- },
- ]
- else:
- params_to_train = model.unet.parameters()
- trainer.set_optimizer_grouped_parameters(params_to_train)
-
- checkpoint = None
- if training_args.resume_from_checkpoint is not None:
- checkpoint = training_args.resume_from_checkpoint
- elif last_checkpoint is not None:
- checkpoint = last_checkpoint
-
- # Training
- trainer.train(resume_from_checkpoint=checkpoint)
- trainer.save_model()
- trainer.save_state()
-
-
-if __name__ == "__main__":
- main()
diff --git a/ppdiffusers/examples/t2i-adapter/README.md b/ppdiffusers/examples/t2i-adapter/README.md deleted file mode 100644 index eed18c6db1d4..000000000000 --- a/ppdiffusers/examples/t2i-adapter/README.md +++ /dev/null @@ -1,167 +0,0 @@
-# T2I-Adapter
-[T2I-Adapter](https://arxiv.org/abs/2302.08453) is a neural network architecture for controlling diffusion models by adding extra conditions. It aligns the internal knowledge of a T2I (Text2Image) model with external control signals and trains different adapters (Adapter) for different conditions, enabling rich and fine-grained control and editing capabilities.
-

- -

-
-## Install Dependencies
-Before running this code, we need to install the develop branch of the ppdiffusers library:
-```bash
-cd ppdiffusers
-python setup.py install
-```
-In addition, we also need to install the related dependencies:
-```bash
-pip install -r requirements.txt
-```
-
-# Training and Inference
-## Adapter Model Training
-Below we take the pose2image task (i.e. pose control) as an example to show how to train the corresponding Adapter model.
-### Data Preparation
-Please prepare the data yourself following the data-processing logic of `adapter/data_preprocess.py` and place the files under the `/data` directory; the data needs to contain the original image, the control text, the control image, and related information.
-
-Tips: we can optionally download the demo data and use it to replace the `/data` directory
-- Download the demo data: `wget https://paddlenlp.bj.bcebos.com/models/community/westfish/t2i-adapter/t2i-adapter-data-demo.zip`;
-
-### Single-GPU Training
-```bash
-export FLAGS_conv_workspace_size_limit=4096
-python -u -m train_t2i_adapter_trainer.py \
- --do_train \
- --output_dir ./sd15_openpose \
- --per_device_train_batch_size 4 \
- --gradient_accumulation_steps 1 \
- --learning_rate 1e-5 \
- --weight_decay 0.02 \
- --lr_scheduler_type "constant" \
- --warmup_steps 0 \
- --max_steps 50000 \
- --logging_steps 1 \
- --image_logging_steps 500 \
- --save_steps 50 \
- --save_total_limit 1000 \
- --seed 4096 \
- --dataloader_num_workers 0 \
- --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \
- --max_grad_norm -1 \
- --file_list ./data/train.openpose.filelist \
- --recompute False --use_ema False \
- --control_type raw \
- --data_format img2img \
- --use_paddle_conv_init False \
- --overwrite_output_dir \
- --timestep_sample_schedule cubic
-```
-The key arguments passed to `train_t2i_adapter_trainer.py` are explained as follows:
-> * `--pretrained_model_name_or_path`: The name or local path of the pretrained SD model to load, e.g. `runwayml/stable-diffusion-v1-5`; `pretrained_model_name_or_path` takes precedence over `vae_name_or_path`, `text_encoder_name_or_path` and `unet_name_or_path`.
-> * `--per_device_train_batch_size`: The `batch_size` used on each GPU during training; when GPU memory is limited, this value needs to be set smaller.
-> * `--gradient_accumulation_steps`: The number of gradient accumulation steps; it can be set to reduce the gradient communication between cards during the accumulated steps, reduce the number of parameter updates, and enlarge the effective training batch size.
-> * `--learning_rate`: The learning rate.
-> * `--weight_decay`: The `weight_decay` of the `AdamW` optimizer.
-> * `--max_steps`: The maximum number of training steps.
-> * `--save_steps`: Save the model every this many steps (counted in `global step`s).
-> * `--save_total_limit`: The maximum number of saved models to keep.
-> * `--lr_scheduler_type`: The learning rate scheduling strategy to use. Defaults to `constant`.
-> * `--warmup_steps`: The number of steps for the linear warmup from 0 to `learning_rate`.
-> * `--image_logging_steps`: Log images from the training process every this many steps, defaults to `1000`; note that `image_logging_steps` needs to be an integer multiple of `logging_steps`.
-> * `--logging_steps`: The logging interval in steps, defaults to `50`.
-> * `--output_dir`: The model save path.
-> * `--seed`: The random seed, set so that training results can be reproduced. Tips: with the current paddle, results still cannot be reproduced perfectly even after setting this seed.
-> * `--dataloader_num_workers`: The `num_workers` argument used by the Dataloader.
-> * `--file_path`: The path of the training data directory; the example above uses the `fill50k` directory.
-> * `--num_inference_steps`: The number of steps used at inference time.
-> * `--model_max_length`: The `model_max_length` argument of the `tokenizer`; text longer than this will be truncated.
-> * `--tokenizer_name`: The `tokenizer_name` to use; we can use the English tokenizer `bert-base-uncased`, or the Chinese tokenizer `ernie-1.0`.
-> * `--use_ema`: Whether to use `ema` for the `unet`, defaults to `False`.
-> * `--max_grad_norm`: The maximum norm for gradient clipping; `-1` means no gradient clipping strategy is used.
-> * `--use_paddle_conv_init`: Whether to use `paddle`'s convolution initialization strategy, defaults to `False`; otherwise the `Uniform` convolution initialization strategy is used.
-> * `--recompute`: Whether to enable recompute (`bool`, optional, defaults to `False`); after enabling it we can use a larger `batch_size`.
-> * `--fp16`: Whether to use fp16 mixed precision training instead of fp32 training. (`bool`, optional, defaults to `False`)
-> * `--fp16_opt_level`: The mixed precision training mode, either ``O1`` or ``O2``, defaults to ``O1``. Only takes effect when the fp16 option is enabled.
-> * `--is_ldmbert`: Whether to use `ldmbert` as the `text_encoder`, defaults to `False`, i.e. the `clip text_encoder` is used.
-> * `--overwrite_output_dir`: When this flag is passed, the previous model save path will be overwritten and training will not be resumed automatically.
-> * `--pretrained_adapter_name_or_path`: The name or local path of the pretrained Adapter model to load, defaults to `None`, in which case the model parameters are randomly initialized.
-> * `--timestep_sample_schedule`: The type of timestep sampling schedule used during training, chosen from [`linear`, `cosine`, `cubic`], defaults to `linear`.
-
-
-
-### Multi-GPU Training (for multi-machine multi-GPU training, just add --ips IP1,IP2,IP3,IP4 after paddle.distributed.launch)
-```bash
-export FLAGS_conv_workspace_size_limit=4096
-python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" train_t2i_adapter_trainer.py \
- --do_train \
- --output_dir ./sd15_openpose \
- --per_device_train_batch_size 4 \
- --gradient_accumulation_steps 1 \
- --learning_rate 1e-5 \
- --weight_decay 0.02 \
- --lr_scheduler_type "constant" \
- --warmup_steps 0 \
- --max_steps 50000 \
- --logging_steps 1 \
- --image_logging_steps 500 \
- --save_steps 50 \
- --save_total_limit 1000 \
- --seed 4096 \
- --dataloader_num_workers 0 \
- --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \
- --max_grad_norm -1 \
- --file_list ./data/train.openpose.filelist \
- --recompute False --use_ema False \
- --control_type raw \
- --data_format img2img \
- --use_paddle_conv_init False \
- --overwrite_output_dir
-```
-
-## Model Inference
-### Simple Inference
-Once training is finished, the trained model weights are saved in `output_dir`, and we can run inference with the following code.
-```python
-from ppdiffusers import StableDiffusionAdapterPipeline, Adapter
-from ppdiffusers.utils import load_image
-adapter = Adapter.from_pretrained("./sd15_control/checkpoint-12000/adapter")
-pipe = StableDiffusionAdapterPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", adapter = adapter, safety_checker=None)
-pose_image = load_image("https://paddlenlp.bj.bcebos.com/models/community/westfish/t2i-adapter/test/man-openpose.png")
-img = pipe(prompt="a beautiful girl", image=pose_image, guidance_scale=9, num_inference_steps=50).images[0]
-img.save("demo.png")
-```
-
-### Test Set Inference
-We can use the following command to run inference on the corresponding test set (which must follow the data-processing logic of `adapter/data_preprocess.py`).
-```
-python generate.py \
- --adapter_model_name_or_path westfish/sd-v1-4-adapter-openpose \
- --sd_model_name_or_path lllyasviel/sd-controlnet-openpose \
- --save_path your/output/path \
- --num_inference_steps 50 \
- --scheduler_type ddim \
- --height=512 \
- --width=512 \
- --device gpu \
- --max_generation_limits 1000 \
- --use_text_cond True \
- --generate_control_image_processor_type openpose \
- --file data/test.openpose.filelist \
- --generate_data_format img2img \
-```
-The key arguments passed to `generate.py` are explained as follows:
-> * `--use_controlnet`: Whether to use ControlNet for conditional control, defaults to `False`, i.e. the Adapter is used for conditional control by default.
-> * `--adapter_model_name_or_path`: The name or path of the Adapter model to use.
-> * `--sd_model_name_or_path`: The name or path of the Stable Diffusion model to use.
-> * `--file`: The data to run the test on.
-> * `--batch_size`: The batch size used when generating images.
-> * `--save_path`: The path where the generated images are saved.
-> * `--guidance_scales`: The guidance_scales values, defaults to [3 5 7].
-> * `--num_inference_steps`: The number of steps used at inference time.
-> * `--scheduler_type`: The type of sampler, supports `ddim`, `pndm`, `euler-ancest` and `lms`.
-> * `--height`: The height of the generated images, defaults to 512.
-> * `--width`: The width of the generated images, defaults to 512.
-> * `--seed`: The random seed.
-> * `--device`: The device to use, which can be `gpu`, `cpu`, `gpu:0`, `gpu:1`, etc.
-> * `--max_generation_limits`: The maximum number of images generated each time.
-> * `--use_text_cond`: Whether to use the text prompts that come with the dataset, defaults to `True`.
-> * `--use_default_neg_text_cond`: Whether to use the default negative prompts, defaults to `True`.
-> * `--generate_control_image_processor_type`: The type of control for generation; `canny` or `openpose` can be selected.
-> * `--generate_data_format`: The data control format; set it to `default` when `generate_control_image_processor_type` is `canny`, and to `img2img` otherwise.
diff --git a/ppdiffusers/examples/t2i-adapter/adapter/__init__.py
b/ppdiffusers/examples/t2i-adapter/adapter/__init__.py deleted file mode 100644 index 7a03949328db..000000000000 --- a/ppdiffusers/examples/t2i-adapter/adapter/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# flake8: noqa - -from .adapter_args import DataArguments, GenerateArguments, ModelArguments -from .adapter_trainer import AdapterLDMTrainer -from .dumpy_dataset import Fill50kDataset -from .model import AdapterLDM -from .text_image_pair_dataset import TextImagePair, worker_init_fn diff --git a/ppdiffusers/examples/t2i-adapter/adapter/adapter_args.py b/ppdiffusers/examples/t2i-adapter/adapter/adapter_args.py deleted file mode 100644 index b34b588c1a25..000000000000 --- a/ppdiffusers/examples/t2i-adapter/adapter/adapter_args.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass, field -from typing import Optional - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. - """ - - adapter_config_file: Optional[str] = field( - default="./config/openpose_adapter.json", metadata={"help": "adapter_config_file"} - ) - vae_name_or_path: Optional[str] = field(default=None, metadata={"help": "pretrained_vae_name_or_path"}) - text_encoder_name_or_path: Optional[str] = field(default=None, metadata={"help": "text_encoder_name_or_path"}) - unet_name_or_path: Optional[str] = field(default=None, metadata={"help": "unet_encoder_name_or_path"}) - tokenizer_name: Optional[str] = field( - default=None, - metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}, - ) - model_max_length: Optional[int] = field(default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) - num_inference_steps: Optional[int] = field(default=50, metadata={"help": "num_inference_steps"}) - use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"}) - pretrained_model_name_or_path: str = field( - default="runwayml/stable-diffusion-v1-5", - metadata={"help": "Path to pretrained model or model, when we want to resume training."}, - ) - pretrained_adapter_name_or_path: str = field( - default=None, - metadata={ - "help": "The pretrained weight of adapter, which is used to facilitate loading the same initialization for training." 
- }, - ) - image_logging_steps: Optional[int] = field(default=1000, metadata={"help": "Log image every X steps."}) - use_paddle_conv_init: bool = field(default=False, metadata={"help": "Whether or not use paddle conv2d init."}) - is_ldmbert: bool = field(default=False, metadata={"help": "Whether to use ldmbert."}) - enable_xformers_memory_efficient_attention: bool = field( - default=False, metadata={"help": "enable_xformers_memory_efficient_attention."} - ) - control_type: Optional[str] = field(default="canny", metadata={"help": "The type of control"}) - latents_path: str = field( - default=None, - metadata={"help": "Path to latents, used for alignment."}, - ) - random_alignment: bool = field(default=False, metadata={"help": "Whether to align random."}) - timestep_sample_schedule: Optional[str] = field( - default="linear", - metadata={ - "help": "The type of timestep-sampling schedule during training, select from ['linear', 'cosine', 'cubic']." - }, - ) - - -@dataclass -class DataArguments: - """ - Arguments pertaining to what data we are going to input our model for training. - """ - - file_list: str = field( - default="./data/filelist/train.filelist.list", metadata={"help": "The name of the file_list."} - ) - resolution: int = field( - default=512, - metadata={ - "help": "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." - }, - ) - num_records: int = field(default=10000000, metadata={"help": "num_records"}) - buffer_size: int = field( - default=100, - metadata={"help": "Buffer size"}, - ) - shuffle_every_n_samples: int = field( - default=5, - metadata={"help": "shuffle_every_n_samples."}, - ) - data_format: str = field( - default="default", - metadata={ - "help": "The data format, must be 'default' or 'img2img'. The img2img format directly provides control image." - }, - ) - - -@dataclass -class GenerateArguments: - """ - Arguments pertaining to specify the model generation settings. - """ - - use_controlnet: bool = field(default=False, metadata={"help": "Whether or not use text condition"}) - use_dumpy_dataset: bool = field(default=False, metadata={"help": "Whether or not use dummpy dataset"}) - adapter_model_name_or_path: str = field(default=None, metadata={"help": "adapter model name or path."}) - sd_model_name_or_path: str = field(default=None, metadata={"help": "sd model name or path."}) - file: str = field(default="data/test.openpose.filelist", metadata={"help": "eval file."}) - seed: int = field(default=42, metadata={"help": "random seed."}) - scheduler_type: str = field( - default="ddim", - metadata={"help": "Type of scheduler to use. 
Should be one of ['pndm', 'lms', 'ddim', 'euler-ancest']"}, - ) - device: str = field(default="gpu", metadata={"help": "device"}) - batch_size: int = field(default=16, metadata={"help": "batch_size"}) - num_inference_steps: int = field(default=50, metadata={"help": "num_inference_steps"}) - save_path: str = field(default="output/adapter/", metadata={"help": "Path to the output file."}) - guidance_scales: str = field(default_factory=lambda: [5, 7, 9], metadata={"help": "guidance_scales list."}) - height: int = field(default=512, metadata={"help": "height."}) - width: int = field(default=512, metadata={"help": "width."}) - max_generation_limits: int = field(default=1000, metadata={"help": "max generation limits."}) - use_text_cond: bool = field(default=True, metadata={"help": "Whether or not use text condition"}) - use_default_neg_text_cond: bool = field( - default=True, metadata={"help": "Whether or not use default negative text condition"} - ) - generate_data_format: str = field(default="img2img", metadata={"help": "Generate data format."}) - generate_control_image_processor_type: str = field(default="openpose", metadata={"help": "Generate data format."}) diff --git a/ppdiffusers/examples/t2i-adapter/adapter/adapter_trainer.py b/ppdiffusers/examples/t2i-adapter/adapter/adapter_trainer.py deleted file mode 100644 index 8f5e8e1a20dd..000000000000 --- a/ppdiffusers/examples/t2i-adapter/adapter/adapter_trainer.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
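As a usage note for the dataclasses above: the t2i-adapter scripts parse them with PaddleNLP's `PdArgumentParser`, the same way the Stable Diffusion trainer script earlier in this patch does. Below is a minimal sketch, assuming it runs from the t2i-adapter example directory so the `adapter` package (whose `__init__.py` exports `GenerateArguments`) is importable.

```python
from paddlenlp.trainer import PdArgumentParser

from adapter import GenerateArguments  # exported by adapter/__init__.py

# Parse generation settings from the command line, e.g. --scheduler_type ddim --height 512
generate_args = PdArgumentParser(GenerateArguments).parse_args_into_dataclasses()[0]
print(generate_args.adapter_model_name_or_path, generate_args.scheduler_type, generate_args.guidance_scales)
```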
- -import contextlib -import os -import sys - -import paddle -import paddle.amp.auto_cast as autocast -from paddle.io import DataLoader - -from paddlenlp.trainer import Trainer -from paddlenlp.trainer.integrations import ( - INTEGRATION_TO_CALLBACK, - VisualDLCallback, - rewrite_logs, -) -from paddlenlp.utils.log import logger -from ppdiffusers.training_utils import unwrap_model - -from .text_image_pair_dataset import TextImagePair, worker_init_fn - - -class VisualDLWithImageCallback(VisualDLCallback): - def autocast_smart_context_manager(self, args): - if args.fp16 or args.bf16: - amp_dtype = "float16" if args.fp16 else "bfloat16" - ctx_manager = autocast( - True, - custom_black_list=[ - "reduce_sum", - "c_softmax_with_cross_entropy", - ], - level=args.fp16_opt_level, - dtype=amp_dtype, - ) - else: - ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() - - return ctx_manager - - def on_step_end(self, args, state, control, model=None, **kwargs): - if hasattr(model, "on_train_batch_end"): - model.on_train_batch_end() - if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0: - control.should_log = True - - def on_log(self, args, state, control, logs=None, **kwargs): - # only on card 0 - if not state.is_world_process_zero: - return - # log image on each node - inputs = kwargs.get("inputs", None) - model = kwargs.get("model", None) - image_logs = {} - if ( - inputs is not None - and model is not None - and args.image_logging_steps > 0 - and state.global_step % args.image_logging_steps == 0 - ): - with self.autocast_smart_context_manager(args): - image_logs["reconstruction"] = model.decode_image(pixel_values=inputs["pixel_values"]) - image_logs["control"] = model.decode_control_image(adapter_cond=inputs["adapter_cond"]) - image_logs["ddim-samples-9.0"] = model.log_image( - input_ids=inputs["input_ids"], - adapter_cond=inputs["adapter_cond"], - guidance_scale=9.0, - height=args.resolution, - width=args.resolution, - ) - - if self.vdl_writer is None: - self._init_summary_writer(args) - - if self.vdl_writer is not None: - logs = rewrite_logs(logs) - for k, v in logs.items(): - if isinstance(v, (int, float)): - self.vdl_writer.add_scalar(k, v, state.global_step) - else: - logger.warning( - "Trainer is attempting to log a value of " - f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' - "This invocation of VisualDL's writer.add_scalar() " - "is incorrect so we dropped this attribute." 
- ) - # log images - for k, v in image_logs.items(): - self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC") - self.vdl_writer.flush() - - -# register visualdl_with_image -INTEGRATION_TO_CALLBACK.update({"custom_visualdl": VisualDLWithImageCallback}) - - -def collate_fn(examples): - pixel_values = paddle.stack([paddle.to_tensor(example["pixel_values"]) for example in examples]) - input_ids = paddle.stack([paddle.to_tensor(example["input_ids"]) for example in examples]) - adapter_cond = paddle.stack([paddle.to_tensor(example["adapter_cond"]) for example in examples]) - - batch = {"input_ids": input_ids, "pixel_values": pixel_values, "adapter_cond": adapter_cond} - return batch - - -class AdapterLDMTrainer(Trainer): - def compute_loss(self, model, inputs, return_outputs=False): - loss = model(**inputs) - return loss - - def get_train_dataloader(self): - if self.train_dataset is None: - raise ValueError("Trainer: training requires a train_dataset.") - if isinstance(self.train_dataset, TextImagePair): - return DataLoader( - self.train_dataset, - batch_size=self.args.train_batch_size, - num_workers=self.args.dataloader_num_workers, - worker_init_fn=worker_init_fn, - collate_fn=collate_fn, - ) - else: - return super().get_train_dataloader() - - def _save(self, output_dir=None, state_dict=None, merge_tensor_parallel=False): - super()._save(output_dir=output_dir, state_dict=state_dict, merge_tensor_parallel=merge_tensor_parallel) - output_dir = output_dir if output_dir is not None else self.args.output_dir - unwrap_model(self.model).adapter.save_pretrained(os.path.join(output_dir, "adapter")) diff --git a/ppdiffusers/examples/t2i-adapter/adapter/annotator_utils.py b/ppdiffusers/examples/t2i-adapter/adapter/annotator_utils.py deleted file mode 100644 index 68c832602166..000000000000 --- a/ppdiffusers/examples/t2i-adapter/adapter/annotator_utils.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -annotator utils. -""" - -import numpy as np -import paddle -from annotator.canny import CannyDetector -from annotator.hed import HEDdetector -from annotator.util import HWC3 -from paddle.vision import transforms - - -def create_annotator(control_type): - """create_annotator by control type.""" - if control_type == "canny": - return CannyProcessor() - elif control_type == "hed": - return HedProcessor() - elif control_type == "raw": - return DummyProcessor() - else: - raise NotImplementedError - - -class DummyProcessor: - """ - Dummy. - """ - - def __init__(self): - self.post_process = transforms.ToTensor() - - def process_data_load(self, image): - """ - Args: - image: PIL image. - Return: - numpy or tensor. (0 ~ 1) - """ - res = self.post_process(image) - return res - - def process_model_forward(self, image): - """dummy""" - return image - - -class CannyProcessor: - """ - canny wrapper. 
- """ - - def __init__(self): - self.canny_thresh = (100, 200) - self.apply_canny = CannyDetector() - self.post_process = transforms.ToTensor() - - def process_data_load(self, image): - """ - Args: - image: PIL image. - Return: - numpy or tensor. (0 ~ 1) - """ - image = np.array(image) - img = HWC3(image) - H, W, C = img.shape - # TODO: random thresh. - detected_map = self.apply_canny(img, *self.canny_thresh) - detected_map = HWC3(detected_map) - res = self.post_process(detected_map) - return res - - def process_model_forward(self, image): - """ - Args: - tensor (GPU) - Return: - tensor (GPU) - """ - return image - - -class HedProcessor: - """ - HED wrapper. - """ - - def __init__(self): - self.apply_hed = HEDdetector(modelpath="you/hed/model") - self.post_process = transforms.ToTensor() - - def process_data_load(self, image): - """ - Args: - image: PIL image. - Return: - numpy or tensor. - """ - image = np.array(image) - img = HWC3(image) # numpy shape=(H, W, C), RGB - img = image[:, :, ::-1] # numpy shape=(H, W, C), BGR - res = self.post_process(img) # tensor, shape=(C, H, W), BGR, \in (0, 1) - return res - - def process_model_forward(self, image): - """ - Args: - tensor (GPU), shape=(B, 3, H, W), (0, 1) - Return: - tensor (GPU), shape=(B, 3, H, W), (0, 1) - """ - with paddle.no_grad(): - edge = self.apply_hed.netNetwork(image) # (B, 1, H, W) - B, C, H, W = edge.shape - edge = edge.expand([B, 3, H, W]) # (B, 3, H, W) - return edge diff --git a/ppdiffusers/examples/t2i-adapter/adapter/data_preprocess.py b/ppdiffusers/examples/t2i-adapter/adapter/data_preprocess.py deleted file mode 100644 index e179df14c8f4..000000000000 --- a/ppdiffusers/examples/t2i-adapter/adapter/data_preprocess.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import base64 -import io -import json -import random - -from PIL import Image - - -def base64_to_image(base64_str): - byte_data = base64.b64decode(base64_str) - image_data = io.BytesIO(byte_data) - img = Image.open(image_data) - if img.mode != "RGB": - img = img.convert("RGB") - return img - - -def process_data(line, filename, data_format): - try: - data = line.strip().split("\t") - if data_format == "img2img": - text_id = data[0] - text_json = json.loads(data[2]) - image_num = int(data[3]) # 2 - image_b64str = data[4 + image_num] # data[6] - control_image_b64str = data[4 + image_num + 1] # data[7] - else: - text_id = data[0] - text_json = json.loads(data[2]) - image_b64str = data[5] - control_image_b64str = None - - caption = "" - caption += text_json.get("caption_en", text_json.get("blip_caption_en", "")) - if caption != "": - image_base64 = image_b64str - else: - return None - - return image_base64, caption, text_id, control_image_b64str - - except Exception as e: - print(f"error when parse file {filename}") - print(e) - return None - - -def parse_line(line, filename, data_format="default"): - try: - res = process_data(line, filename, data_format) - if res is not None: - image_base64, caption, _id, control_image_base64 = res - image = Image.open(io.BytesIO(base64.b64decode(image_base64))).convert("RGB") - if control_image_base64 is not None: - image_extract = io.BytesIO(base64.b64decode(control_image_base64)) - control_image = Image.open(image_extract).convert("RGB") - - control_image = control_image.resize(image.size) - else: - control_image = None - - if image.size[0] < image.size[1]: # 长图裁剪 - crop_size = (0, 0, image.size[0], image.size[0]) - else: # 宽图裁剪 - crop_size = ( - (image.size[0] - image.size[1]) // 2, - 0, - (image.size[0] + image.size[1]) // 2, - image.size[1], - ) - image = image.crop(crop_size) - if control_image is not None: - control_image = control_image.crop(crop_size) - - # drop out - if random.random() < 0.5: - caption = "" - return dict( - image=image, - caption=caption, - _id=_id, - control_image=control_image, - ) - else: - return None - except Exception as e: - print(f"error when parse file {filename}") - print(e) - return None diff --git a/ppdiffusers/examples/t2i-adapter/adapter/dumpy_dataset.py b/ppdiffusers/examples/t2i-adapter/adapter/dumpy_dataset.py deleted file mode 100644 index 679e67b4749d..000000000000 --- a/ppdiffusers/examples/t2i-adapter/adapter/dumpy_dataset.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
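Since the column layout expected by `process_data` above is easy to get wrong, the following sketch builds one synthetic `img2img`-format line and feeds it through `parse_line`; the paths, ids, and filler columns are made up purely for illustration, and the import assumes the script runs from the t2i-adapter example directory.

```python
import base64
import io
import json

from PIL import Image

from adapter.data_preprocess import parse_line


def to_b64(img):
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode("utf-8")


image_b64 = to_b64(Image.new("RGB", (512, 512), "white"))    # original image
control_b64 = to_b64(Image.new("RGB", (512, 512), "black"))  # e.g. a rendered pose map
meta = json.dumps({"caption_en": "a person standing in a room"})

# img2img layout: col 0 = id, col 2 = caption json, col 3 = image count,
# col 4 + count = image, col 4 + count + 1 = control image (here count = 2).
line = "\t".join(["sample-0", "unused", meta, "2", "unused", "unused", image_b64, control_b64])

sample = parse_line(line, "demo.filelist", data_format="img2img")
# Note: the caption is randomly dropped to "" about half the time for classifier-free guidance.
print(sample["caption"], sample["image"].size, sample["control_image"].size)
```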
-
-import json
-import os
-
-import numpy as np
-import paddle
-from paddle.io import Dataset
-from PIL import Image
-
-
-class Fill50kDataset(Dataset):
- def __init__(self, tokenizer, file_path="./fill50k", do_image_processing=True, do_text_processing=True):
- self.tokenizer = tokenizer
- self.image_list = []
- self.label_list = []
- self.file_path = file_path
- self.do_image_processing = do_image_processing
- self.do_text_processing = do_text_processing
- self.data = []
- self.file_path = file_path
- with open(os.path.join(file_path, "prompt.json"), "rt") as f:
- for line in f:
- self.data.append(json.loads(line))
-
- self.text_processing = None
- if tokenizer:
- self.text_processing = lambda caption: tokenizer(
- caption,
- padding="max_length",
- truncation=True,
- max_length=tokenizer.model_max_length,
- return_tensors="np",
- ).input_ids[0]
- self.do_image_processing = do_image_processing
- self.do_text_processing = do_text_processing
-
- def __len__(self):
- return len(self.data)
-
- def __getitem__(self, idx):
- item = self.data[idx]
-
- source_filename = item["source"]
- target_filename = item["target"]
- prompt = item["prompt"]
-
- source = Image.open(os.path.join(self.file_path, source_filename))
- target = Image.open(os.path.join(self.file_path, target_filename))
-
- if self.do_image_processing:
- # Convert the PIL images to numpy arrays before normalizing.
- # Normalize source images to [0, 1].
- source = np.array(source).astype(np.float32) / 255.0
- source = paddle.to_tensor(source.transpose([2, 0, 1]), dtype=paddle.float32)
-
- # Normalize target images to [-1, 1].
- target = (np.array(target).astype(np.float32) / 127.5) - 1.0
- target = paddle.to_tensor(target.transpose([2, 0, 1]), dtype=paddle.float32)
-
- if self.text_processing and self.do_text_processing:
- input_ids = self.text_processing(prompt)
- input_ids = paddle.to_tensor(input_ids, dtype=paddle.int64)
- else:
- input_ids = prompt
-
- return dict(
- input_ids=input_ids,
- pixel_values=target,
- adapter_cond=source,
- )
diff --git a/ppdiffusers/examples/t2i-adapter/adapter/model.py b/ppdiffusers/examples/t2i-adapter/adapter/model.py deleted file mode 100644 index f8c7f83ad460..000000000000 --- a/ppdiffusers/examples/t2i-adapter/adapter/model.py +++ /dev/null @@ -1,332 +0,0 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
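For a quick smoke test of `Fill50kDataset` above, a sketch along these lines could be used. It assumes the `fill50k` folder from the ControlNet example has been downloaded locally, and it loads the tokenizer by the "pretrained path + /tokenizer" convention that the adapter model code in this patch uses; both are assumptions, not part of the original scripts.

```python
from paddle.io import DataLoader

from paddlenlp.transformers import AutoTokenizer

from adapter import Fill50kDataset

# Mirrors the tokenizer loading convention used in adapter/model.py (pretrained path + "/tokenizer").
tokenizer = AutoTokenizer.from_pretrained("runwayml/stable-diffusion-v1-5/tokenizer", model_max_length=77)
dataset = Fill50kDataset(tokenizer, file_path="./fill50k")

loader = DataLoader(dataset, batch_size=2, shuffle=True)
batch = next(iter(loader))
print(batch["input_ids"].shape, batch["pixel_values"].shape, batch["adapter_cond"].shape)
```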
- -import contextlib -import inspect -import json -import os - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from paddlenlp.transformers import AutoTokenizer, CLIPTextModel -from paddlenlp.utils.log import logger -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - DDPMScheduler, - LDMBertModel, - T2IAdapter, - UNet2DConditionModel, - is_ppxformers_available, -) - -# from ppdiffusers.initializer import reset_initialized_parameter -from ppdiffusers.models.ema import LitEma -from ppdiffusers.training_utils import freeze_params - -from .annotator_utils import create_annotator - - -def read_json(file): - with open(file, "r", encoding="utf-8") as f: - data = json.load(f) - return data - - -generator = np.random.RandomState(42) - - -class AdapterLDM(nn.Layer): - def __init__(self, model_args): - super().__init__() - # init control image processor - self.control_image_processor = create_annotator(model_args.control_type) - - # init tokenizer - tokenizer_name_or_path = ( - model_args.tokenizer_name - if model_args.pretrained_model_name_or_path is None - else os.path.join(model_args.pretrained_model_name_or_path, "tokenizer") - ) - self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name_or_path, model_max_length=model_args.model_max_length - ) - - vae_name = "vqvae" if model_args.is_ldmbert else "vae" - # init vae - vae_name_or_path = ( - model_args.vae_name_or_path - if model_args.pretrained_model_name_or_path is None - else os.path.join(model_args.pretrained_model_name_or_path, vae_name) - ) - - self.vae = AutoencoderKL.from_pretrained(vae_name_or_path) - freeze_params(self.vae.parameters()) - logger.info("Freeze vae parameters!") - - if model_args.is_ldmbert: - text_encoder_name_or_path = ( - model_args.text_encoder_name_or_path - if model_args.pretrained_model_name_or_path is None - else os.path.join(model_args.pretrained_model_name_or_path, "bert") - ) - # init text_encoder - self.text_encoder = LDMBertModel.from_pretrained(text_encoder_name_or_path) - else: - text_encoder_name_or_path = ( - model_args.text_encoder_name_or_path - if model_args.pretrained_model_name_or_path is None - else os.path.join(model_args.pretrained_model_name_or_path, "text_encoder") - ) - self.text_encoder = CLIPTextModel.from_pretrained(text_encoder_name_or_path) - - freeze_params(self.text_encoder.parameters()) - logger.info("Freeze text_encoder parameters!") - - unet_name_or_path = ( - model_args.unet_name_or_path - if model_args.pretrained_model_name_or_path is None - else os.path.join(model_args.pretrained_model_name_or_path, "unet") - ) - - self.unet = UNet2DConditionModel.from_pretrained(unet_name_or_path) - - freeze_params(self.unet.parameters()) - logger.info("Freeze unet parameters!") - - if model_args.pretrained_adapter_name_or_path: - self.adapter = T2IAdapter.from_pretrained(model_args.pretrained_adapter_name_or_path) - else: - self.adapter = T2IAdapter(**read_json(model_args.adapter_config_file)) - - self.noise_scheduler = DDPMScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000 - ) - self.eval_scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - steps_offset=1, - ) - self.eval_scheduler.set_timesteps(model_args.num_inference_steps) - self.use_ema = model_args.use_ema - if self.use_ema: - self.model_ema = LitEma(self.adapter) - self.adapter_conditioning_scale = 1.0 - - if 
model_args.enable_xformers_memory_efficient_attention and is_ppxformers_available():
- try:
- self.unet.enable_xformers_memory_efficient_attention()
- self.adapter.enable_xformers_memory_efficient_attention()
- except Exception as e:
- logger.warn(
- "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed"
- f" correctly and a GPU is available: {e}"
- )
- self.use_preconfig_latents = False
- if model_args.latents_path:
- self.use_preconfig_latents = True
- self.register_buffer("preconfig_latents", paddle.load(model_args.latents_path))
- self.random_alignment = model_args.random_alignment
- self.timestep_sample_schedule = model_args.timestep_sample_schedule
-
- @contextlib.contextmanager
- def ema_scope(self, context=None):
- if self.use_ema:
- # The EMA shadow weights track the adapter (LitEma(self.adapter)).
- self.model_ema.store(self.adapter.parameters())
- self.model_ema.copy_to(self.adapter)
- if context is not None:
- print(f"{context}: Switched to EMA weights")
- try:
- yield None
- finally:
- if self.use_ema:
- self.model_ema.restore(self.adapter.parameters())
- if context is not None:
- print(f"{context}: Restored training weights")
-
- def on_train_batch_end(self):
- if self.use_ema:
- self.model_ema(self.adapter)
-
- def get_time_with_schedule(self, timestep_sample_schedule, bs):
- if timestep_sample_schedule == "linear":
- t = paddle.randint(low=0, high=self.noise_scheduler.num_train_timesteps, shape=(bs,)).astype(dtype="int64")
- elif timestep_sample_schedule == "cosine":
- t = paddle.rand(shape=(bs,))
- t = paddle.cos(x=np.pi / 2.0 * t) * self.noise_scheduler.num_train_timesteps
- t = t.astype(dtype="int64")
- elif timestep_sample_schedule == "cubic":
- t = paddle.rand(shape=(bs,))
- t = (1 - t**3) * self.noise_scheduler.num_train_timesteps
- t = t.astype(dtype="int64")
- else:
- raise NotImplementedError
- t = paddle.clip(x=t, min=0, max=self.noise_scheduler.num_train_timesteps - 1)
- return t
-
- def get_time_with_schedule_and_numpy_generator(self, timestep_sample_schedule, bs):
- if timestep_sample_schedule == "linear":
- t = paddle.to_tensor(
- generator.randint(0, self.noise_scheduler.num_train_timesteps, size=(bs,)), dtype="int64"
- )
- elif timestep_sample_schedule == "cosine":
- t = paddle.to_tensor(generator.rand(bs))
- t = paddle.cos(x=np.pi / 2.0 * t) * self.noise_scheduler.num_train_timesteps
- t = t.astype(dtype="int64")
- elif timestep_sample_schedule == "cubic":
- t = paddle.to_tensor(generator.rand(bs))
- t = (1 - t**3) * self.noise_scheduler.num_train_timesteps
- t = t.astype(dtype="int64")
- else:
- raise NotImplementedError
- t = paddle.clip(x=t, min=0, max=self.noise_scheduler.num_train_timesteps - 1)
- return t
-
- def forward(self, input_ids=None, pixel_values=None, adapter_cond=None, **kwargs):
- with paddle.no_grad():
- adapter_cond = self.control_image_processor.process_model_forward(adapter_cond)
- self.train()
- with paddle.amp.auto_cast(enable=False):
- with paddle.no_grad():
- self.vae.eval()
- self.text_encoder.eval()
- latents = self.vae.encode(pixel_values).latent_dist.sample()
- latents = latents * 0.18215
- if self.random_alignment:
- timesteps = self.get_time_with_schedule_and_numpy_generator(
- self.timestep_sample_schedule, latents.shape[0]
- )
- noise = paddle.to_tensor(generator.randn(*latents.shape), dtype="float32")
- else:
- timesteps = self.get_time_with_schedule(self.timestep_sample_schedule, latents.shape[0])
- noise = paddle.randn(latents.shape)
- noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps)
- encoder_hidden_states =
self.text_encoder(input_ids)[0] - adapter_state = self.adapter(adapter_cond) - - for k, v in enumerate(adapter_state): - adapter_state[k] = v * self.adapter_conditioning_scale - - # predict the noise residual - noise_pred = self.unet( - noisy_latents, - timestep=timesteps, - encoder_hidden_states=encoder_hidden_states, - down_block_additional_residuals=adapter_state, - ).sample - loss = F.mse_loss(noise_pred, noise, reduction="mean") - return loss - - @paddle.no_grad() - def decode_image(self, pixel_values=None, **kwargs): - self.eval() - if pixel_values.shape[0] > 8: - pixel_values = pixel_values[:8] - latents = self.vae.encode(pixel_values).latent_dist.sample() - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1).transpose([0, 2, 3, 1]) - image = (image * 255.0).cast("float32").numpy().round() - return image - - @paddle.no_grad() - def decode_control_image(self, adapter_cond=None, **kwargs): - adapter_cond = self.control_image_processor.process_model_forward(adapter_cond) # (0, 1) - return 255 * (adapter_cond.transpose([0, 2, 3, 1])).cast("float32").numpy().round() - - @paddle.no_grad() - def log_image(self, input_ids=None, adapter_cond=None, height=512, width=512, eta=0.0, guidance_scale=9, **kwargs): - adapter_cond = self.control_image_processor.process_model_forward(adapter_cond) - self.eval() - with self.ema_scope(): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - # only log 8 image - if input_ids.shape[0] > 4: - input_ids = input_ids[:4] - - text_embeddings = self.text_encoder(input_ids)[0] - do_classifier_free_guidance = guidance_scale > 1.0 - if do_classifier_free_guidance: - batch_size, max_length = input_ids.shape - uncond_input = self.tokenizer( - [""] * batch_size, - padding="max_length", - truncation=True, - max_length=max_length, - return_tensors="pd", - ) - uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] - text_embeddings = paddle.concat([uncond_embeddings, text_embeddings], axis=0) - if self.use_preconfig_latents: - latents = self.preconfig_latents - else: - latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, height // 8, width // 8)) - # ddim donot use this - latents = latents * self.eval_scheduler.init_noise_sigma - - accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - for t in self.eval_scheduler.timesteps: - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - - # ddim donot use this - latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t) - - # Adapter predict the noise residual - adapter_state = self.adapter(adapter_cond) - for k, v in enumerate(adapter_state): - adapter_state[k] = v * self.adapter_conditioning_scale - if do_classifier_free_guidance: - for k, v in enumerate(adapter_state): - adapter_state[k] = paddle.concat(x=[v] * 2, axis=0) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=text_embeddings, - down_block_additional_residuals=[state.clone() for state in adapter_state], - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # 
compute the previous noisy sample x_t -> x_t-1 - latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - latents = 1 / 0.18215 * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1).transpose([0, 2, 3, 1]) * 255.0 - - return image.cast("float32").numpy().round() - - def set_recompute(self, value=False): - def fn(layer): - if hasattr(layer, "gradient_checkpointing"): - layer.gradient_checkpointing = value - print("Set", layer.__class__, "recompute", layer.gradient_checkpointing) - - self.adapter.apply(fn) diff --git a/ppdiffusers/examples/t2i-adapter/adapter/text_image_pair_dataset.py b/ppdiffusers/examples/t2i-adapter/adapter/text_image_pair_dataset.py deleted file mode 100644 index a3d1481c3980..000000000000 --- a/ppdiffusers/examples/t2i-adapter/adapter/text_image_pair_dataset.py +++ /dev/null @@ -1,224 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gzip -import random - -import numpy as np -import paddle -import paddle.distributed as dist -from paddle.io import IterableDataset, get_worker_info -from paddle.vision import transforms -from paddle.vision.transforms.transforms import _get_image_size -from PIL import Image - -from .data_preprocess import parse_line - -Image.MAX_IMAGE_PIXELS = 2300000000 - - -# donot use random.randint -class RandomCrop(transforms.RandomCrop): - def _get_param(self, img, output_size): - w, h = _get_image_size(img) - th, tw = output_size - if w == tw and h == th: - return 0, 0, h, w - - i = paddle.randint(0, h - th + 1).item() - j = paddle.randint(0, w - tw + 1).item() - return i, j, th, tw - - -class TextImagePair(IterableDataset): - def __init__( - self, - file_list, - size, - num_records, - image_processing=None, - buffer_size=1000, - shuffle_every_n_samples=5, - interpolation="lanczos", - tokenizer=None, - control_image_processor=None, - data_format="default", - do_image_processing=True, - ): - self.size = size - self.resize_transform = transforms.Resize(int(size), interpolation) - if image_processing is None: - self.image_processing = transforms.Compose( - [ - transforms.ToTensor(), # (0 ~ 1) - transforms.Normalize(0.5, 0.5), # (-1 ~ 1) - ] - ) - else: - self.image_processing = image_processing - if tokenizer is not None: - self.text_processing = lambda caption: tokenizer( - caption, - padding="max_length", - truncation=True, - max_length=tokenizer.model_max_length, - return_tensors="np", - ).input_ids[0] - else: - self.text_processing = None - - if control_image_processor is not None: - self.control_image_processor = control_image_processor - else: - self.control_image_processor = None - - self.file_list = [] - file_weights = [] - with open(file_list, "r") as f: - file_lists = f.read().strip().split("\n") - for file_l in file_lists: - file_l = file_l.split(" ") - if len(file_l) > 1: - file_weight = float(file_l[1]) - file_weights.append(file_weight) - file_l = file_l[0] - with open(file_l, "r") as f: - 
self.file_list.append(f.read().strip().split("\n")) - print([len(file_l) for file_l in self.file_list]) - if len(file_weights) == len(self.file_list): - file_weights = np.array(file_weights) - file_weight_sum = np.sum(file_weights) - assert file_weight_sum > 0, "sum of file weights must > 0" - file_weights = file_weights / file_weight_sum - print(f"sample weights of files: {file_weights}") - self.file_weights_cumsum = np.cumsum(file_weights) - self.file_weights_cumsum = np.concatenate([[0.0], self.file_weights_cumsum]) - else: - print("sample each file list with same probabiliy") - self.file_weights_cumsum = None - - self.num_records = num_records - self.file_ids = [np.arange(len(filelist)) for filelist in self.file_list] - print(f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}") - self.buffer_size = buffer_size - self.shuffle_every_n_samples = shuffle_every_n_samples - self.data_format = data_format - self.do_image_processing = do_image_processing - - def sample_loader(self, file_ids, filenames): - while True: - random.shuffle(file_ids) - for i in file_ids: - filename = filenames[i].strip("\n") - with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f: - # retry = 0 - while True: - line = f.readline() - - if line == b"": - break - try: - try: - line = line.decode(encoding="utf-8") - except Exception: - line = line.decode(encoding="gb18030") - except Exception: - print(f"error on file {filename}") - continue - data = parse_line(line, filename, self.data_format) - if data is None: - continue - else: - w, h = data["image"].size - if w < self.size or h < self.size: - continue - image = self.resize_transform(data["image"]) - - control_image = data["control_image"] - if control_image is not None: - control_image = self.resize_transform(control_image) - else: - control_image = image - out = { - "pixel_values": self.image_processing(image).numpy() - if self.do_image_processing - else image, - "input_ids": self.text_processing(data["caption"]) - if self.text_processing - else data["caption"], - "adapter_cond": self.control_image_processor.process_data_load(control_image).numpy() - if self.control_image_processor - else control_image, - } - yield out - - def random_load_from_multi_dataset(self): - print(f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}") - sample_loader_per_dataset = [ - iter(self.sample_loader(self.file_ids[i], self.file_list[i])) for i in range(len(self.file_ids)) - ] - - while True: - if self.file_weights_cumsum is None: - sample_loader = random.choice(sample_loader_per_dataset) - else: - rand_num = random.random() - for i in range(len(self.file_list)): - if self.file_weights_cumsum[i] <= rand_num < self.file_weights_cumsum[i + 1]: - break - sample_loader = sample_loader_per_dataset[i] - yield next(sample_loader) - - def shuffle(self, iterator): - buffer_list = [] - for _ in range(self.buffer_size): - buffer_list.append(next(iterator)) - i = 0 - while True: - if i % self.shuffle_every_n_samples == 0: - random.shuffle(buffer_list) - yield buffer_list.pop() - buffer_list.append(next(iterator)) - i += 1 - - def __len__(self): - return self.num_records - - def __iter__(self): - return self.shuffle(iter(self.random_load_from_multi_dataset())) - - -def worker_init_fn(_): - worker_info = get_worker_info() - dataset = worker_info.dataset - worker_id = worker_info.id - local_rank = dist.get_rank() - world_size = dist.get_world_size() - num_workers = worker_info.num_workers - worker_global_id = local_rank * 
num_workers + worker_id - - dataset.rng = np.random.RandomState(worker_global_id) - for i in range(len(dataset.file_ids)): - - file_ids = dataset.file_ids[i] - num_chunks = world_size * num_workers - chunk_size = len(file_ids) // num_chunks - - begin_id = worker_global_id * chunk_size - end_id = (worker_global_id + 1) * chunk_size - dataset.file_ids[i] = dataset.file_ids[i][begin_id:end_id] - print( - f"dataset {i}, local_rank: {local_rank}, worker_id: {worker_id}, worker_global_id: {worker_global_id}, file_range: ({begin_id}, {end_id})" - ) - return np.random.seed(np.random.get_state()[1][0] + worker_id) diff --git a/ppdiffusers/examples/t2i-adapter/annotator b/ppdiffusers/examples/t2i-adapter/annotator deleted file mode 120000 index 394fe0c7511e..000000000000 --- a/ppdiffusers/examples/t2i-adapter/annotator +++ /dev/null @@ -1 +0,0 @@ -../controlnet/annotator/ \ No newline at end of file diff --git a/ppdiffusers/examples/t2i-adapter/config/openpose_adapter.json b/ppdiffusers/examples/t2i-adapter/config/openpose_adapter.json deleted file mode 100644 index de3ee07d5f16..000000000000 --- a/ppdiffusers/examples/t2i-adapter/config/openpose_adapter.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "_class_name": "Adapter", - "_ppdiffusers_version": "0.2.2", - "channels_in": 3, - "block_out_channels": [ - 320, - 640, - 1280, - 1280 - ], - "num_res_blocks": 2, - "kernel_size": 1, - "res_block_skip": true, - "use_conv": false, - "input_scale_factor": 8 -} diff --git a/ppdiffusers/examples/t2i-adapter/data/laion-aes-canny.filelist.test b/ppdiffusers/examples/t2i-adapter/data/laion-aes-canny.filelist.test deleted file mode 100644 index 450e8db94cf1..000000000000 --- a/ppdiffusers/examples/t2i-adapter/data/laion-aes-canny.filelist.test +++ /dev/null @@ -1 +0,0 @@ -/root/data/t2i-adapter/canny/part-00049 \ No newline at end of file diff --git a/ppdiffusers/examples/t2i-adapter/data/laion-aes-canny.filelist.train b/ppdiffusers/examples/t2i-adapter/data/laion-aes-canny.filelist.train deleted file mode 100644 index 21dbcaa02088..000000000000 --- a/ppdiffusers/examples/t2i-adapter/data/laion-aes-canny.filelist.train +++ /dev/null @@ -1,50 +0,0 @@ -/root/data/t2i-adapter/canny/part-00000 -/root/data/t2i-adapter/canny/part-00001 -/root/data/t2i-adapter/canny/part-00002 -/root/data/t2i-adapter/canny/part-00003 -/root/data/t2i-adapter/canny/part-00004 -/root/data/t2i-adapter/canny/part-00005 -/root/data/t2i-adapter/canny/part-00006 -/root/data/t2i-adapter/canny/part-00007 -/root/data/t2i-adapter/canny/part-00008 -/root/data/t2i-adapter/canny/part-00009 -/root/data/t2i-adapter/canny/part-00010 -/root/data/t2i-adapter/canny/part-00011 -/root/data/t2i-adapter/canny/part-00012 -/root/data/t2i-adapter/canny/part-00013 -/root/data/t2i-adapter/canny/part-00014 -/root/data/t2i-adapter/canny/part-00015 -/root/data/t2i-adapter/canny/part-00016 -/root/data/t2i-adapter/canny/part-00017 -/root/data/t2i-adapter/canny/part-00018 -/root/data/t2i-adapter/canny/part-00019 -/root/data/t2i-adapter/canny/part-00020 -/root/data/t2i-adapter/canny/part-00021 -/root/data/t2i-adapter/canny/part-00022 -/root/data/t2i-adapter/canny/part-00023 -/root/data/t2i-adapter/canny/part-00024 -/root/data/t2i-adapter/canny/part-00025 -/root/data/t2i-adapter/canny/part-00026 -/root/data/t2i-adapter/canny/part-00027 -/root/data/t2i-adapter/canny/part-00028 -/root/data/t2i-adapter/canny/part-00029 -/root/data/t2i-adapter/canny/part-00030 -/root/data/t2i-adapter/canny/part-00031 -/root/data/t2i-adapter/canny/part-00032 -/root/data/t2i-adapter/canny/part-00033 
-/root/data/t2i-adapter/canny/part-00034 -/root/data/t2i-adapter/canny/part-00035 -/root/data/t2i-adapter/canny/part-00036 -/root/data/t2i-adapter/canny/part-00037 -/root/data/t2i-adapter/canny/part-00038 -/root/data/t2i-adapter/canny/part-00039 -/root/data/t2i-adapter/canny/part-00040 -/root/data/t2i-adapter/canny/part-00041 -/root/data/t2i-adapter/canny/part-00042 -/root/data/t2i-adapter/canny/part-00043 -/root/data/t2i-adapter/canny/part-00044 -/root/data/t2i-adapter/canny/part-00045 -/root/data/t2i-adapter/canny/part-00046 -/root/data/t2i-adapter/canny/part-00047 -/root/data/t2i-adapter/canny/part-00048 -/root/data/t2i-adapter/canny/part-00049 \ No newline at end of file diff --git a/ppdiffusers/examples/t2i-adapter/data/laion-aes-openpose.filelist.test b/ppdiffusers/examples/t2i-adapter/data/laion-aes-openpose.filelist.test deleted file mode 100644 index 48c218aca2f5..000000000000 --- a/ppdiffusers/examples/t2i-adapter/data/laion-aes-openpose.filelist.test +++ /dev/null @@ -1 +0,0 @@ -/root/data/t2i-adapter/openpose/part-00049 \ No newline at end of file diff --git a/ppdiffusers/examples/t2i-adapter/data/laion-aes-openpose.filelist.train b/ppdiffusers/examples/t2i-adapter/data/laion-aes-openpose.filelist.train deleted file mode 100644 index 36b0a9111abf..000000000000 --- a/ppdiffusers/examples/t2i-adapter/data/laion-aes-openpose.filelist.train +++ /dev/null @@ -1,49 +0,0 @@ -/root/data/t2i-adapter/openpose/part-00000 -/root/data/t2i-adapter/openpose/part-00001 -/root/data/t2i-adapter/openpose/part-00002 -/root/data/t2i-adapter/openpose/part-00003 -/root/data/t2i-adapter/openpose/part-00004 -/root/data/t2i-adapter/openpose/part-00005 -/root/data/t2i-adapter/openpose/part-00006 -/root/data/t2i-adapter/openpose/part-00007 -/root/data/t2i-adapter/openpose/part-00008 -/root/data/t2i-adapter/openpose/part-00009 -/root/data/t2i-adapter/openpose/part-00010 -/root/data/t2i-adapter/openpose/part-00011 -/root/data/t2i-adapter/openpose/part-00012 -/root/data/t2i-adapter/openpose/part-00013 -/root/data/t2i-adapter/openpose/part-00014 -/root/data/t2i-adapter/openpose/part-00015 -/root/data/t2i-adapter/openpose/part-00016 -/root/data/t2i-adapter/openpose/part-00017 -/root/data/t2i-adapter/openpose/part-00018 -/root/data/t2i-adapter/openpose/part-00019 -/root/data/t2i-adapter/openpose/part-00020 -/root/data/t2i-adapter/openpose/part-00021 -/root/data/t2i-adapter/openpose/part-00022 -/root/data/t2i-adapter/openpose/part-00023 -/root/data/t2i-adapter/openpose/part-00024 -/root/data/t2i-adapter/openpose/part-00025 -/root/data/t2i-adapter/openpose/part-00026 -/root/data/t2i-adapter/openpose/part-00027 -/root/data/t2i-adapter/openpose/part-00028 -/root/data/t2i-adapter/openpose/part-00029 -/root/data/t2i-adapter/openpose/part-00031 -/root/data/t2i-adapter/openpose/part-00032 -/root/data/t2i-adapter/openpose/part-00033 -/root/data/t2i-adapter/openpose/part-00034 -/root/data/t2i-adapter/openpose/part-00035 -/root/data/t2i-adapter/openpose/part-00036 -/root/data/t2i-adapter/openpose/part-00037 -/root/data/t2i-adapter/openpose/part-00038 -/root/data/t2i-adapter/openpose/part-00039 -/root/data/t2i-adapter/openpose/part-00040 -/root/data/t2i-adapter/openpose/part-00041 -/root/data/t2i-adapter/openpose/part-00042 -/root/data/t2i-adapter/openpose/part-00043 -/root/data/t2i-adapter/openpose/part-00044 -/root/data/t2i-adapter/openpose/part-00045 -/root/data/t2i-adapter/openpose/part-00046 -/root/data/t2i-adapter/openpose/part-00047 -/root/data/t2i-adapter/openpose/part-00048 
-/root/data/t2i-adapter/openpose/part-00049 \ No newline at end of file diff --git a/ppdiffusers/examples/t2i-adapter/data/test.canny.filelist b/ppdiffusers/examples/t2i-adapter/data/test.canny.filelist deleted file mode 100644 index 9568ab352459..000000000000 --- a/ppdiffusers/examples/t2i-adapter/data/test.canny.filelist +++ /dev/null @@ -1 +0,0 @@ -./data/laion-aes-canny.filelist.test \ No newline at end of file diff --git a/ppdiffusers/examples/t2i-adapter/data/test.openpose.filelist b/ppdiffusers/examples/t2i-adapter/data/test.openpose.filelist deleted file mode 100644 index 4433b5685909..000000000000 --- a/ppdiffusers/examples/t2i-adapter/data/test.openpose.filelist +++ /dev/null @@ -1 +0,0 @@ -./data/laion-aes-openpose.filelist.test \ No newline at end of file diff --git a/ppdiffusers/examples/t2i-adapter/data/train.canny.filelist b/ppdiffusers/examples/t2i-adapter/data/train.canny.filelist deleted file mode 100644 index 551f88da249e..000000000000 --- a/ppdiffusers/examples/t2i-adapter/data/train.canny.filelist +++ /dev/null @@ -1 +0,0 @@ -./data/laion-aes-openpose.filelist.train \ No newline at end of file diff --git a/ppdiffusers/examples/t2i-adapter/data/train.openpose.filelist b/ppdiffusers/examples/t2i-adapter/data/train.openpose.filelist deleted file mode 100644 index 551f88da249e..000000000000 --- a/ppdiffusers/examples/t2i-adapter/data/train.openpose.filelist +++ /dev/null @@ -1 +0,0 @@ -./data/laion-aes-openpose.filelist.train \ No newline at end of file diff --git a/ppdiffusers/examples/t2i-adapter/generate.py b/ppdiffusers/examples/t2i-adapter/generate.py deleted file mode 100644 index 0a56d61f6a0d..000000000000 --- a/ppdiffusers/examples/t2i-adapter/generate.py +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import random - -import numpy as np -import paddle -from adapter import DataArguments, Fill50kDataset, GenerateArguments, TextImagePair -from annotator.canny import CannyDetector -from annotator.util import HWC3 -from PIL import Image -from tqdm import tqdm - -from paddlenlp.trainer import PdArgumentParser -from ppdiffusers import ( - ControlNetModel, - DDIMScheduler, - EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionAdapterPipeline, - StableDiffusionControlNetPipeline, - T2IAdapter, -) - -DEFAULT_NEGATIVE_PROMPT = ( - "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, " - "fewer digits, cropped, worst quality, low quality" -) - - -class CannyProcessor: - """ - canny wrapper. - """ - - def __init__(self, is_output_3d=False): - self.is_output_3d = is_output_3d - self.canny_thresh = (100, 200) - self.apply_canny = CannyDetector() - - def process_data_load(self, image): - """ - Args: - image: PIL image. - Return: - numpy or tensor. (0 ~ 1) - """ - image = np.array(image) - img = HWC3(image) - H, W, C = img.shape - # TODO: random thresh. 
- detected_map = self.apply_canny(img, *self.canny_thresh) - if self.is_output_3d: - detected_map = HWC3(detected_map) - detected_map = Image.fromarray(detected_map) - return detected_map - - def process_model_forward(self, image): - """ - Args: - tensor (GPU) - Return: - tensor (GPU) - """ - return image - - -def set_seed(seed: int): - random.seed(seed) - np.random.seed(seed) - paddle.seed(seed) - - -def generate_images( - use_controlnet=False, - adapter_model_name_or_path=None, - sd_model_name_or_path=None, - batch_size=16, - test_dataset=None, - save_path="output", - guidance_scales=[3, 4, 5, 6, 7, 8], - num_inference_steps=50, - scheduler_type="ddim", - device="gpu", - max_generation_limits=1000, - use_text_cond=True, - use_default_neg_text_cond=True, - generate_control_image_processor_type=None, - eta=0.0, -): - # set pipe - paddle.set_device(device) - if use_controlnet: - controlnet = ControlNetModel.from_pretrained(adapter_model_name_or_path) - pipe = StableDiffusionControlNetPipeline.from_pretrained( - sd_model_name_or_path, controlnet=controlnet, safety_checker=None - ) - else: - adapter = T2IAdapter.from_pretrained(adapter_model_name_or_path) - pipe = StableDiffusionAdapterPipeline.from_pretrained( - sd_model_name_or_path, adapter=adapter, safety_checker=None - ) - pipe.set_progress_bar_config(disable=True) - - # set scheduler - beta_start = pipe.scheduler.beta_start - beta_end = pipe.scheduler.beta_end - if scheduler_type == "pndm": - scheduler = PNDMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - set_alpha_to_one=False, - steps_offset=1, - # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, - ) - elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") - elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler( - beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear" - ) - elif scheduler_type == "ddim": - scheduler = DDIMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - # Make sure the scheduler compatible with DDIM - clip_sample=False, - set_alpha_to_one=False, - steps_offset=1, - ) - else: - raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - pipe.scheduler = scheduler - - # generate - if generate_control_image_processor_type == "canny": - if use_controlnet: - canny_processor = CannyProcessor(is_output_3d=True) - else: - canny_processor = CannyProcessor() - for cfg in guidance_scales: - set_seed(generate_args.seed) - new_save_path = os.path.join(save_path, f"cfg_{cfg}") - os.makedirs(new_save_path, exist_ok=True) - cond_save_path = os.path.join(save_path, "adapter_cond") - os.makedirs(cond_save_path, exist_ok=True) - origin_save_path = os.path.join(save_path, "origin_cond") - os.makedirs(origin_save_path, exist_ok=True) - write_file = open(os.path.join(save_path, "caption.txt"), "w") - i = 0 - for data in tqdm(test_dataset): - if ( - generate_control_image_processor_type == "canny" - ): # Canny mode needs to manually process the control image - data["adapter_cond"] = canny_processor.process_data_load(data["pixel_values"]) - images = pipe( - data["input_ids"] if use_text_cond else "", - negative_prompt=DEFAULT_NEGATIVE_PROMPT if use_default_neg_text_cond else "", - image=data["adapter_cond"], - guidance_scale=float(cfg), - eta=eta, - num_inference_steps=num_inference_steps, - )[0] - 
data["adapter_cond"].save(os.path.join(cond_save_path, "{:05d}_000.png".format(i))) - data["pixel_values"].save(os.path.join(origin_save_path, "{:05d}_000.png".format(i))) - write_file.write("{:05d}_000".format(i) + "\t" + data["input_ids"].strip() + "\n") - for image in images: - path = os.path.join(new_save_path, "{:05d}_000.png".format(i)) - image.save(path) - i += 1 - if i % max_generation_limits == 0: - break - - -if __name__ == "__main__": - parser = PdArgumentParser((DataArguments, GenerateArguments)) - data_args, generate_args = parser.parse_args_into_dataclasses() - print("----------- Configuration Arguments -----------") - for arg, value in sorted(vars(generate_args).items()): - print("%s: %s" % (arg, value)) - print("------------------------------------------------") - set_seed(generate_args.seed) - - if generate_args.use_dumpy_dataset: - test_dataset = Fill50kDataset( - tokenizer=None, - file_path=generate_args.file, - do_image_processing=False, - do_text_processing=False, - ) - - else: - test_dataset = TextImagePair( - file_list=generate_args.file, - size=data_args.resolution, - num_records=data_args.num_records, - buffer_size=data_args.buffer_size, - shuffle_every_n_samples=data_args.shuffle_every_n_samples, - interpolation="lanczos", - data_format=generate_args.generate_data_format, - control_image_processor=None, - do_image_processing=False, - ) - - generate_images( - use_controlnet=generate_args.use_controlnet, - adapter_model_name_or_path=generate_args.adapter_model_name_or_path, - sd_model_name_or_path=generate_args.sd_model_name_or_path, - batch_size=generate_args.batch_size, - test_dataset=test_dataset, - save_path=generate_args.save_path, - guidance_scales=generate_args.guidance_scales, - num_inference_steps=generate_args.num_inference_steps, - scheduler_type=generate_args.scheduler_type, - device=generate_args.device, - max_generation_limits=generate_args.max_generation_limits, - use_text_cond=generate_args.use_text_cond, - use_default_neg_text_cond=generate_args.use_default_neg_text_cond, - generate_control_image_processor_type=generate_args.generate_control_image_processor_type, - ) diff --git a/ppdiffusers/examples/t2i-adapter/requirements.txt b/ppdiffusers/examples/t2i-adapter/requirements.txt deleted file mode 100644 index e6b5c9550bf9..000000000000 --- a/ppdiffusers/examples/t2i-adapter/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -paddlehub>=2.3.1 -paddleseg>=2.7.0 -paddlenlp>=2.5.1 -opencv-python -pillow==9.4.0 \ No newline at end of file diff --git a/ppdiffusers/examples/t2i-adapter/tools/convert_diffusers_adapter_to_ppdiffusers.py b/ppdiffusers/examples/t2i-adapter/tools/convert_diffusers_adapter_to_ppdiffusers.py deleted file mode 100644 index 01f4839ec21f..000000000000 --- a/ppdiffusers/examples/t2i-adapter/tools/convert_diffusers_adapter_to_ppdiffusers.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import os -from collections import OrderedDict - -import paddle -import torch -from diffusers import T2IAdapter as DiffusersAdapterNetModel - -from ppdiffusers import T2IAdapter as PPDiffusersAdapterNetModel - -paddle.set_device("cpu") - - -def convert_to_ppdiffusers(controlnet, dtype="float32"): - need_transpose = [] - for k, v in controlnet.named_modules(): - if isinstance(v, torch.nn.Linear): - need_transpose.append(k + ".weight") - new_controlnet = OrderedDict() - for k, v in controlnet.state_dict().items(): - if k not in need_transpose: - new_controlnet[k] = v.cpu().numpy().astype(dtype) - else: - new_controlnet[k] = v.t().cpu().numpy().astype(dtype) - return new_controlnet - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default="RzZ/sd-v1-4-adapter-color", - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--output_path", - type=str, - default="paddle_models/sd-v1-4-adapter-color", - help="The model output path.", - ) - args = parser.parse_args() - - th_controlnet = DiffusersAdapterNetModel.from_pretrained(args.pretrained_model_name_or_path) - controlnet_state_dict = convert_to_ppdiffusers(th_controlnet) - pp_controlnet = PPDiffusersAdapterNetModel.from_config(th_controlnet.config) - pp_controlnet.set_dict(controlnet_state_dict) - if not os.path.exists(args.output_path): - os.makedirs(args.output_path) - pp_controlnet.save_pretrained(args.output_path) diff --git a/ppdiffusers/examples/t2i-adapter/tools/convert_orig_adapter_ckpt_to_ppdiffusers.py b/ppdiffusers/examples/t2i-adapter/tools/convert_orig_adapter_ckpt_to_ppdiffusers.py deleted file mode 100644 index e32ea35b8f46..000000000000 --- a/ppdiffusers/examples/t2i-adapter/tools/convert_orig_adapter_ckpt_to_ppdiffusers.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import json -from collections import OrderedDict - -import paddle -import torch -from fastcore.all import patch_to - - -def read_json(file): - with open(file, "r", encoding="utf-8") as f: - data = json.load(f) - return data - - -def convert_to_paddle(vae_or_unet, dtype="float32"): - need_transpose = [] - for k, v in vae_or_unet.items(): - if isinstance(v, torch.nn.Linear): - need_transpose.append(k + ".weight") - new_vae_or_unet = OrderedDict() - for k, v in vae_or_unet.items(): - if k not in need_transpose: - new_vae_or_unet[k] = v.cpu().numpy().astype(dtype) - else: - new_vae_or_unet[k] = v.t().cpu().numpy().astype(dtype) - return new_vae_or_unet - - -@patch_to(paddle.nn.Layer) -def load_state_dict(self: paddle.nn.Layer, state_dict: dict, use_structured_name=True, strict=True): - orig = self.state_dict() - orig_keys = set([k for k in orig.keys()]) - loaded_keys = set([k for k in state_dict.keys()]) - - missing_keys = list(orig_keys - loaded_keys) - unexpected_keys = list(loaded_keys - orig_keys) - print(f"missing_keys: {missing_keys}") - print(f"unexpected_keys: {unexpected_keys}") - if strict and (len(missing_keys) > 0 or len(unexpected_keys) > 0): - raise ValueError("state_dict donot match the orignial state_dict!") - return self.load_dict(state_dict, use_structured_name=use_structured_name) - - -def convert_adapter(state): - mapping = { - "down_opt": "downsample", - "in_conv": "conv1", - "out_conv": "conv2", - } - - def apply(name): - for k, v in mapping.items(): - name = name.replace(k, v) - return name - - cvr_state = {apply(k): v for k, v in state.items()} - return cvr_state - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") - - parser.add_argument( - "--orig_t2i_adapter_project_path", - type=str, - default="pytorch/T2I-Adapter", - help="Path to a torch model parameters file", - ) - parser.add_argument( - "--orig_t2i_adapter_pretrained_ckpt_path", - type=str, - default="ckpt/t2iadapter_openpose_sd14v1.pth", - help="Path to a torch model parameters file", - ) - parser.add_argument( - "--ppdiffusers_t2i_adapter_model_config_path", - type=str, - default="ppdiffusers/examples/t2i-adapter/config/openpose_adapter.json", - help="Path to a torch model parameters file", - ) - parser.add_argument( - "--ppdiffusers_t2i_adapter_model_output_path", - type=str, - default="paddle_models/sd-v1-4-adapter-openpose_initialized", - help="The model output path.", - ) - args = parser.parse_args() - - import os - import sys - - sys.path.append(args.orig_t2i_adapter_project_path) - from ldm.modules.encoders.adapter import Adapter as torch_network - - Torch_Model = torch_network( - cin=int(3 * 64), channels=[320, 640, 1280, 1280][:4], nums_rb=2, ksize=1, sk=True, use_conv=False - ) - from ppdiffusers import T2IAdapter as paddle_network - - Paddle_Model = paddle_network(**read_json(args.ppdiffusers_t2i_adapter_model_config_path)) - - torch_model = Torch_Model - if args.orig_t2i_adapter_pretrained_ckpt_path: - torch_model.load_state_dict( - torch.load(args.orig_t2i_adapter_pretrained_ckpt_path, map_location=torch.device("cpu")), strict=True - ) - # When orig_t2i_adapter_pretrained_ckpt_path is not specified, the randomly initialized torch weights are stored in orig_t2i_adapter_pretrained_ckpt_path - else: - torch.save( - torch_model.state_dict(), - os.path.join(args.orig_t2i_adapter_project_path, "ckpt", "torch_t2i_model_initialized.pth"), - ) - torch_model_dict = convert_adapter(torch_model.state_dict()) - 
numpy_state_dict = convert_to_paddle(torch_model_dict) - paddle_model = Paddle_Model - paddle_model.load_state_dict(numpy_state_dict) - paddle_model.save_pretrained(args.ppdiffusers_t2i_adapter_model_output_path) diff --git a/ppdiffusers/examples/t2i-adapter/tools/convert_t2i_adapter_to_latest_version.py b/ppdiffusers/examples/t2i-adapter/tools/convert_t2i_adapter_to_latest_version.py deleted file mode 100644 index 578b10f944ac..000000000000 --- a/ppdiffusers/examples/t2i-adapter/tools/convert_t2i_adapter_to_latest_version.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Conversion script for the T2I-Adapter checkpoints. -""" - -import argparse -import re - -import paddle - - -def convert_adapter(state): - mapping = { - "down_opt": "downsample", - "in_conv": "conv1", - "out_conv": "conv2", - } - - def apply(name): - for k, v in mapping.items(): - name = name.replace(k, v) - return name - - cvr_state = {apply(k): v for k, v in state.items()} - return cvr_state - - -def convert_adapter_light(old_state_dict): - mapping = { - "body.0.in_conv.bias": "conv_in.bias", - "body.0.in_conv.weight": "conv_in.weight", - "body.0.out_conv.bias": "body.3.out_conv.bias", - "body.0.out_conv.weight": "body.3.out_conv.weight", - "body.1.in_conv.bias": "body.4.in_conv.bias", - "body.1.in_conv.weight": "body.4.in_conv.weight", - "body.1.out_conv.bias": "body.7.out_conv.bias", - "body.1.out_conv.weight": "body.7.out_conv.weight", - "body.2.in_conv.bias": "body.8.in_conv.bias", - "body.2.in_conv.weight": "body.8.in_conv.weight", - "body.2.out_conv.bias": "body.11.out_conv.bias", - "body.2.out_conv.weight": "body.11.out_conv.weight", - "body.3.in_conv.bias": "body.12.in_conv.bias", - "body.3.in_conv.weight": "body.12.in_conv.weight", - "body.3.out_conv.bias": "body.15.out_conv.bias", - "body.3.out_conv.weight": "body.15.out_conv.weight", - } - cvr_state = {} - resblock = re.compile(r"body\.(\d+)\.body\.(\d+)\.(.+)") - for k, v in old_state_dict.items(): - m = resblock.match(k) - if m: - new_group = int(m.group(1)) * 4 + int(m.group(2)) - cvr_state[f"body.{new_group}.{m.group(3)}"] = v - else: - cvr_state[mapping[k]] = v - return cvr_state - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." - ) - parser.add_argument( - "--output_path", default=None, type=str, required=True, help="Path to the store the result checkpoint." - ) - parser.add_argument( - "--is_adapter_light", - default=False, - type=bool, - required=False, - help="Is checkpoint come from Adapter-Light architecture. 
ex: color-adapter", - ) - - args = parser.parse_args() - src_state = paddle.load(args.checkpoint_path) - if args.is_adapter_light: - res_state = convert_adapter_light(src_state) - else: - res_state = convert_adapter(src_state) - paddle.save(res_state, args.output_path) diff --git a/ppdiffusers/examples/t2i-adapter/tools/make_dummpy_dataset.py b/ppdiffusers/examples/t2i-adapter/tools/make_dummpy_dataset.py deleted file mode 100644 index 172b6727c299..000000000000 --- a/ppdiffusers/examples/t2i-adapter/tools/make_dummpy_dataset.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json -import os - -from tqdm import tqdm - -from ppdiffusers.utils import load_image - -dataset_base_name_one_type_one_url_base = "" -dataset_base_name_one_type_two_url_base = "" -dataset_base_name_two_type_one_url_base = "" -dataset_base_name_two_type_two_url_base = "" - -parser = argparse.ArgumentParser() -parser.add_argument( - "--dataset_base_name", - type=str, - default="artv4_openpose_test13", - help="The dataset basename.", -) -parser.add_argument( - "--ids_list_path", - type=str, - default="artv4_openpose_test13_ids.txt", - help="The ids list path.", -) -parser.add_argument( - "--ids_list_path", - type=str, - default="artv4_openpose_test13_ids.txt", - help="The ids list path.", -) -parser.add_argument( - "--source_prompt_list_one_path", - type=str, - default="prompts_artv4_openpose_test1_en_prompts.txt", - help="The first source prompt list path.", -) -parser.add_argument( - "--source_prompt_list_two_path", - type=str, - default="prompts_artv4_openpose_test2_en_prompts.txt", - help="The second source prompt list path.", -) -parser.add_argument( - "--source_prompt_list_three_path", - type=str, - default="prompts_artv4_openpose_test3_en_prompts.txt", - help="The third source prompt list path.", -) -parser.add_argument( - "--dataset_prompt_json_name", - type=str, - default="prompt.json", - help="The dataset prompt json name.", -) -args = parser.parse_args() - - -def get_images_form_urls(ids_list, dir_path, dataset_base_name, type=None, is_resize=False): - for i, id in enumerate(tqdm(ids_list)): - if dataset_base_name == "artv4_openpose_test13": - if type == "原图": - img_url = dataset_base_name_one_type_one_url_base + f"{id}/{id}_final00_control.png" - elif type == "Openpose控制图": - img_url = dataset_base_name_one_type_two_url_base + f"{id}/{id}_final00_control_openpose.png" - if dataset_base_name == "artv4_openpose_test2": - if type == "原图": - img_url = dataset_base_name_two_type_one_url_base + f"{id}/{id}_final00_control.png" - elif type == "Openpose控制图": - img_url = dataset_base_name_two_type_one_url_base + f"{id}/{id}_final00_control_openpose.png" - in_image = load_image(img_url) - if is_resize: - in_image = in_image.resize((512, 512)) - os.makedirs(dir_path, exist_ok=True) - name = str(i) + "_" + id + ".png" - in_image.save(os.path.join(dir_path, name)) - - -def get_prompt_json_file(ids_list, 
prompt_lists, dataset_base_name): - with open(os.path.join(dataset_base_name, args.dataset_prompt_json_name), "w") as wf: - for i, id in enumerate(ids_list): - which_prompt_list = int(id.split("_")[1][-1]) - 1 - which_prompt = int(id.split("_")[-1]) - name = str(i) + "_" + id + ".png" - - data = { - "source": "source/" + name, - "target": "target/" + name, - "prompt": prompt_lists[which_prompt_list][which_prompt].strip(), - } - json_str = json.dumps(data) - wf.write(json_str + "\n") - - -if __name__ == "__main__": - dataset_base_name = args.dataset_base_name - ids_list = [line.strip() for line in open(args.ids_list_path, "r", encoding="utf8").readlines()] - - source_prompt_lists = [ - [line.strip() for line in open(args.source_prompt_list_one_path, "r", encoding="utf8").readlines()], - [line.strip() for line in open(args.source_prompt_list_two_path, "r", encoding="utf8").readlines()], - [line.strip() for line in open(args.source_prompt_list_three_path, "r", encoding="utf8").readlines()], - ] - - source_dir = os.path.join(dataset_base_name, "source") - target_dir = os.path.join(dataset_base_name, "target") - get_images_form_urls(ids_list, source_dir, dataset_base_name, type="Openpose控制图", is_resize=False) - get_images_form_urls(ids_list, target_dir, dataset_base_name, type="原图", is_resize=False) - get_prompt_json_file(ids_list, source_prompt_lists, dataset_base_name) diff --git a/ppdiffusers/examples/t2i-adapter/train_t2i_adapter_trainer.py b/ppdiffusers/examples/t2i-adapter/train_t2i_adapter_trainer.py deleted file mode 100644 index 688c3c3b5a59..000000000000 --- a/ppdiffusers/examples/t2i-adapter/train_t2i_adapter_trainer.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math -import os - -import paddle -from adapter import ( - AdapterLDM, - AdapterLDMTrainer, - DataArguments, - ModelArguments, - TextImagePair, -) - -from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint -from paddlenlp.utils.log import logger - - -def unfreeze_params(params): - for param in params: - param.stop_gradient = False - - -def main(): - parser = PdArgumentParser((ModelArguments, DataArguments, TrainingArguments)) - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # report to custom_visualdl - training_args.report_to = ["custom_visualdl"] - training_args.resolution = data_args.resolution - training_args.image_logging_steps = model_args.image_logging_steps = ( - math.ceil(model_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps - ) - training_args.print_config(model_args, "Model") - training_args.print_config(data_args, "Data") - - paddle.set_device(training_args.device) - - # Detecting last checkpoint. 
- last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - model = AdapterLDM(model_args) - train_dataset = TextImagePair( - file_list=data_args.file_list, - size=data_args.resolution, - num_records=data_args.num_records, - buffer_size=data_args.buffer_size, - shuffle_every_n_samples=data_args.shuffle_every_n_samples, - interpolation="lanczos", - tokenizer=model.tokenizer, - control_image_processor=model.control_image_processor, - data_format=data_args.data_format, - ) - trainer = AdapterLDMTrainer( - model=model, - args=training_args, - train_dataset=train_dataset, - tokenizer=model.tokenizer, - ) - # must set recompute after trainer init - trainer.model.set_recompute(training_args.recompute) - - params_to_train = trainer.model.adapter.parameters() - trainer.set_optimizer_grouped_parameters(params_to_train) - - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - - # Training - trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() - trainer.save_state() - - -if __name__ == "__main__": - main() diff --git a/ppdiffusers/examples/text_to_image/README.md b/ppdiffusers/examples/text_to_image/README.md deleted file mode 100644 index 769da60ee14d..000000000000 --- a/ppdiffusers/examples/text_to_image/README.md +++ /dev/null @@ -1,250 +0,0 @@ -# 微调Stable Diffusion模型 - -`train_text_to_image.py`脚本展示如何在自定义数据集上微调Stable Diffusion模型。 - -___Note___: - -___该训练代码是实验性质的。由于这里的代码微调了整个`UNet模型`,通常该模型可能会产生过拟合的现象,可能会产生像`"catastrophic forgetting"`的问题。如果用户在自己的数据集上进行微调训练,为了得到更好的训练结果,建议尝试使用不同的参数值。___ - - -## 1 本地运行 -### 1.1 安装依赖 - -在运行这个训练代码前,我们需要安装下面的训练依赖。 - -```bash -pip install -U ppdiffusers visualdl -``` - -### 1.2 Pokemon训练教程 - -为了下载`CompVis/stable-diffusion-v1-4`模型权重,我们需要阅读并签署相关的License。在这里我们默认用户已经阅读并签署了解了相关License,有关License及模型的详细介绍,请访问[CompVis/stable-diffusion-v1-4 card](https://huggingface.co/CompVis/stable-diffusion-v1-4)。 - -> License: The CreativeML OpenRAIL M license is an Open RAIL M license, adapted from the work that BigScience and the RAIL Initiative are jointly carrying in the area of responsible AI licensing. See also the article about the BLOOM Open RAIL license on which our license is based. -
- -#### 1.2.1 硬件要求 -当我们开启`gradient_checkpointing`功能后(Tips:该功能可以在一定程度上减少显存消耗),我们可以在24GB显存的GPU上微调模型。如果想要使用更大的`batch_size`进行更快的训练,建议用户使用具有30GB+显存的显卡。 - -#### 1.2.2 单机单卡训练 -```bash -export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export dataset_name="lambdalabs/pokemon-blip-captions" - -python -u train_text_to_image.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --dataset_name=$dataset_name \ - --resolution=512 --center_crop --random_flip \ - --train_batch_size=1 \ - --gradient_accumulation_steps=4 \ - --gradient_checkpointing \ - --max_train_steps=15000 \ - --learning_rate=1e-05 \ - --max_grad_norm=1 \ - --lr_scheduler="constant" --lr_warmup_steps=0 \ - --output_dir="sd-pokemon-model" -``` - - -`train_text_to_image.py`代码可传入的参数解释如下: -> 主要修改的参数 -> * `--pretrained_model_name_or_path`: 所使用的Stable Diffusion模型权重名称或者本地下载的模型路径,目前支持了上表中的8种模型权重,我们可直接替换使用。 -> * `--dataset_name`: 数据集名字,可填写`HuggingFace hub`已有的数据集名字。 -> * `--dataset_config_name`: 数据集所使用的config配置名字。 -> * `--train_data_dir`: 如果选择自定义数据集的话,需要提供数据集地址,该地址要遵循 https://huggingface.co/docs/datasets/image_dataset#imagefolder 上的格式。 -> * `--image_column`: 图片所在的列名,默认为`image`。 -> * `--caption_column`: 文本描述所在的列名,默认为`text`。 -> * `--gradient_checkpointing`: 是否开启`gradient_checkpointing`功能,在一定程度上能够更显显存,但是会减慢训练速度。 -> * `--use_ema`: 是否使用EMA模型。 -> * `--num_train_epochs`: 训练的轮数,默认值为`100`。 -> * `--max_train_steps`: 最大的训练步数,当我们设置这个值后,它会重新计算所需的`num_train_epochs`轮数。 -> * `--save_steps`: 每间隔多少步`(global step步数)`,保存学习到的文件`learned_embeds.pdparams`。 -> * `--gradient_accumulation_steps`: 梯度累积的步数,用户可以指定梯度累积的步数,在梯度累积的step中。减少多卡之间梯度的通信,减少更新的次数,扩大训练的batch_size。 - -> 可以修改的参数 -> * `--learning_rate`: 学习率。 -> * `--scale_lr`: 是否根据GPU数量,梯度累积步数,以及批量数对学习率进行缩放。缩放公式:`learning_rate * gradient_accumulation_steps * train_batch_size * num_processes`。 -> * `--lr_scheduler`: 要使用的学习率调度策略。默认为 `constant`。 -> * `--lr_warmup_steps`: 用于从 0 到 `learning_rate` 的线性 warmup 的步数。 -> * `--train_batch_size`: 训练时每张显卡所使用的`batch_size批量`,当我们的显存较小的时候,需要将这个值设置的小一点。 -> * `--center_crop`: 在调整图片宽和高之前是否将裁剪图像居中,默认值为`False`。 -> * `--height`: 输入给模型的图片`高度`,由于用户输入的并不是固定大小的图片,因此代码中会将原始大小的图片压缩成指定`高度`的图片,默认值为`None`。 -> * `--width`: 输入给模型的图片`宽度`,由于用户输入的并不是固定大小的图片,因此代码中会将原始大小的图片压缩成指定`宽度`的图片,默认值为`None`。 -> * `--resolution`: 输入给模型图片的`分辨率`,当`高度`或`宽度`为`None`时,我们将会使用`resolution`,默认值为`512`。 -> * `--gradient_checkpointing`: 是否开启`gradient_checkpointing`功能,在一定程度上能够更显显存,但是会减慢训练速度。 -> * `--output_dir`: 模型训练完所保存的路径,默认设置为`sd-pokemon-model`文件夹,建议用户每训练一个模型可以修改一下输出路径,防止先前已有的模型被覆盖了。 -> * `--enable_xformers_memory_efficient_attention`: 是否开启`xformers`,开启后训练速度会变慢,但是能够节省显存。注意我们需要安装develop版本的paddlepaddle! 
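As a quick sanity check of the `--scale_lr` formula quoted above, the snippet below works through one hypothetical configuration (the numbers are illustrative, not recommended values):

```python
# Hypothetical example of the --scale_lr rule:
# learning_rate * gradient_accumulation_steps * train_batch_size * num_processes
learning_rate = 1e-5
gradient_accumulation_steps = 4
train_batch_size = 1
num_processes = 4  # e.g. launching with --gpus "0,1,2,3"

scaled_lr = learning_rate * gradient_accumulation_steps * train_batch_size * num_processes
print(scaled_lr)  # 0.00016
```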
- -> 基本无需修改的参数 -> * `--seed`: 随机种子,为了可以复现训练结果,Tips:当前paddle设置该随机种子后仍无法完美复现。 -> * `--adam_beta1`: `AdamW` 优化器时的 `beta1` 超参数。默认为 `0.9`。 -> * `--adam_beta2`: `AdamW` 优化器时的 `beta2` 超参数。默认为 `0.999`。 -> * `--adam_weight_decay`: `AdamW` 优化器时的 `weight_decay` 超参数。 默认为`0.02`。 -> * `--adam_weight_decay`: `AdamW` 优化器时的 `epsilon` 超参数。默认为 `1e-8`。 -> * `--max_grad_norm`: 最大梯度范数(用于梯度裁剪)。默认为 `-1` 表示不使用。 -> * `--logging_dir`: Tensorboard 或 VisualDL 记录日志的地址,注意:该地址会与输出目录进行拼接,即,最终的日志地址为`/`。 -> * `--report_to`: 用于记录日志的工具,可选`["tensorboard", "visualdl"]`,默认为`visualdl`,如果选用`tensorboard`,请使用命令安装`pip install tensorboardX`。 -> * `--push_to_hub`: 是否将模型上传到 `huggingface hub`,默认值为 `False`。 -> * `--hub_token`: 上传到 `huggingface hub` 所需要使用的 `token`,如果我们已经登录了,那么我们就无需填写。 -> * `--hub_model_id`: 上传到 `huggingface hub` 的模型库名称, 如果为 `None` 的话表示我们将使用 `output_dir` 的名称作为模型库名称。 - -#### 1.2.3 单机多卡训练 -通过设置`--gpus`,我们可以指定 GPU 为 `0,1,2,3` 卡。这里我们只训练了`4000step`,因为这里的`4000 step x 4卡`近似于`单卡训练 16000 step`。 - -```bash -export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export dataset_name="lambdalabs/pokemon-blip-captions" - -python -u -m paddle.distributed.launch --gpus "0,1,2,3" train_text_to_image.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --dataset_name=$dataset_name \ - --resolution=512 --center_crop --random_flip \ - --train_batch_size=1 \ - --gradient_accumulation_steps=4 \ - --gradient_checkpointing \ - --max_train_steps=4000 \ - --learning_rate=1e-05 \ - --max_grad_norm=1 \ - --lr_scheduler="constant" --lr_warmup_steps=0 \ - --output_dir="sd-pokemon-model" -``` - - -#### 1.2.4 预测生成图片 - -当训练完成后,模型将自动保存到`output_dir`目录,在上述例子中,我们的模型最终保存到了`sd-pokemon-model`文件夹。我们可以使用`StableDiffusionPipeline`快速加载该模型。 - -``` -├── train_text_to_image.py # 训练脚本 -├── sd-pokemon-model # 我们指定的输出文件路径 - ├── vae # vae权重文件夹 - ├── model_state.pdparams - ├── config.json - ├── text_encoder # text_encoder权重文件夹 - ├── config.json - ├── model_state.pdparams - ├── unet # unet权重文件夹 - ├── model_state.pdparams - ├── config.json - ├── scheduler # scheduler文件夹 - ├── scheduler_config.json - ├── feature_extractor # feature_extractor文件夹 - ├── preprocessor_config.json - ├── tokenizer # tokenizer文件夹 - ├── tokenizer_config.json - ├── merges.txt - ├── special_tokens_map.json - ├── added_tokens.json - ├── vocab.json -``` - -```python -from ppdiffusers import StableDiffusionPipeline - -# 我们所需加载的模型地址,这里我们输入了训练时候使用的 output_dir 地址 -model_path = "sd-pokemon-model" -pipe = StableDiffusionPipeline.from_pretrained(model_path) - -image = pipe(prompt="yoda").images[0] -# 保存图片,我们可以查看 yoda-pokemon.png 图片。 -image.save("yoda-pokemon.png") -``` -
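Since each subfolder in the saved directory above is itself a complete model directory, individual components can also be reloaded on their own, mirroring the `os.path.join(pretrained_model_name_or_path, "unet")` pattern used elsewhere in this repository. A minimal sketch, assuming the illustrative `sd-pokemon-model` output directory from above:

```python
import os

from ppdiffusers import AutoencoderKL, UNet2DConditionModel

model_path = "sd-pokemon-model"  # the output_dir used during training
# Load the sub-models directly from their subfolders.
unet = UNet2DConditionModel.from_pretrained(os.path.join(model_path, "unet"))
vae = AutoencoderKL.from_pretrained(os.path.join(model_path, "vae"))
print(type(unet).__name__, type(vae).__name__)
```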


- -### 1.3 自定义数据集训练教程 -如果用户想要在自己的数据集上进行训练,那么需要根据`huggingface的 datasets 库`所需的格式准备数据集,有关数据集的介绍可以查看 [HF dataset的文档](https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder-with-metadata). - -如果用户想要修改代码中的部分训练逻辑,那么需要修改训练代码。 - -```bash -export MODEL_NAME="CompVis/stable-diffusion-v1-4" -# 这里需要输入你自己的训练集路径 -export TRAIN_DIR="path_to_your_dataset" - -python -u train_text_to_image.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --dataset_name=$dataset_name \ - --use_ema \ - --resolution=512 --center_crop --random_flip \ - --train_batch_size=1 \ - --gradient_accumulation_steps=4 \ - --gradient_checkpointing \ - --max_train_steps=15000 \ - --learning_rate=1e-05 \ - --max_grad_norm=1 \ - --lr_scheduler="constant" --lr_warmup_steps=0 \ - --output_dir="sd-custom-model" -``` - -# 使用 LoRA 和 Text-to-Image 技术进行模型训练 - -[LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) 是微软研究员引入的一项新技术,主要用于处理大模型微调的问题。目前超过数十亿以上参数的具有强能力的大模型 (例如 GPT-3) 通常在为了适应其下游任务的微调中会呈现出巨大开销。LoRA 建议冻结预训练模型的权重并在每个 Transformer 块中注入可训练层 (秩-分解矩阵)。因为不需要为大多数模型权重计算梯度,所以大大减少了需要训练参数的数量并且降低了 GPU 的内存要求。研究人员发现,通过聚焦大模型的 Transformer 注意力块,使用 LoRA 进行的微调质量与全模型微调相当,同时速度更快且需要更少的计算。 - -简而言之,LoRA允许通过向现有权重添加一对秩分解矩阵,并只训练这些新添加的权重来适应预训练的模型。这有几个优点: - -- 保持预训练的权重不变,这样模型就不容易出现灾难性遗忘 [catastrophic forgetting](https://www.pnas.org/doi/10.1073/pnas.1611835114); -- 秩分解矩阵的参数比原始模型少得多,这意味着训练的 LoRA 权重很容易移植; -- LoRA 注意力层允许通过一个 `scale` 参数来控制模型适应新训练图像的程度。 - -[cloneofsimo](https://github.com/cloneofsimo) 是第一个在 [LoRA GitHub](https://github.com/cloneofsimo/lora) 仓库中尝试使用 LoRA 训练 Stable Diffusion 的人。 - -## 训练 - -**___Note: 如果我们使用 [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 进行训练,那么我们需要将 `resolution` 改成 768 .___** - -```bash -export MODEL_NAME="runwayml/stable-diffusion-v1-5" -export DATASET_NAME="lambdalabs/pokemon-blip-captions" -export OUTPUT_DIR="sd-pokemon-model-lora" - -python train_text_to_image_lora.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --dataset_name=$DATASET_NAME \ - --dataloader_num_workers=8 \ - --resolution=512 --center_crop --random_flip \ - --train_batch_size=1 \ - --gradient_accumulation_steps=4 \ - --max_train_steps=15000 \ - --learning_rate=1e-04 \ - --max_grad_norm=1 \ - --lr_scheduler="cosine" --lr_warmup_steps=0 \ - --output_dir=${OUTPUT_DIR} \ - --report_to=visualdl \ - --checkpointing_steps=500 \ - --validation_prompt="Totoro" \ - --lora_rank=4 \ - --seed=1337 \ - --validation_epochs 10 -``` -**___Note: 当我使用 LoRA 训练模型的时候,我们需要使用更大的学习率,因此我们这里使用 *1e-4* 而不是 *1e-5*.___** - -最终经过微调后的 LoRA 权重,我们已经上传到了 [junnyu/sd-model-finetuned-lora-a100](https://huggingface.co/junnyu/sd-model-finetuned-lora-a100). **___Note: [最终的权重](https://huggingface.co/junnyu/sd-model-finetuned-lora-a100/blob/main/paddle_lora_weights.pdparams) 只有 3 MB 的大小.___** - - -## 推理 - -经过训练, LoRA 权重可以直接加载到原始的 pipeline 中。 - -```python -from ppdiffusers import StableDiffusionPipeline -import paddle - -model_path = "junnyu/sd-model-finetuned-lora-a100" -pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", paddle_dtype=paddle.float32) -# 注意:如果我们想从 HF Hub 加载权重,那么我们需要设置 from_hf_hub=True -pipe.unet.load_attn_procs(model_path, from_hf_hub=True) - -prompt = "Totoro" -image = pipe(prompt, num_inference_steps=30, guidance_scale=7.5).images[0] -image.save("Totoro.png") -``` -
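To make the rank-decomposition idea described in the LoRA section above concrete, here is a minimal toy sketch in Paddle: the pretrained weight is frozen, only the two low-rank matrices are trainable, and a `scale` factor controls how strongly the adaptation is mixed in. This is purely illustrative (class and attribute names are hypothetical) and is not the attention-processor implementation that `train_text_to_image_lora.py` actually uses.

```python
import paddle
import paddle.nn as nn


class LoRALinear(nn.Layer):
    """Toy LoRA layer: y = W x + scale * B(A(x)), with W frozen and A, B trainable."""

    def __init__(self, in_features, out_features, rank=4, scale=1.0):
        super().__init__()
        # frozen pretrained projection
        self.base = nn.Linear(in_features, out_features)
        self.base.weight.stop_gradient = True
        self.base.bias.stop_gradient = True
        # trainable rank decomposition; B starts at zero so the initial output equals the base layer
        self.lora_down = nn.Linear(in_features, rank, bias_attr=False)
        self.lora_up = nn.Linear(rank, out_features, bias_attr=False)
        self.lora_up.weight.set_value(paddle.zeros_like(self.lora_up.weight))
        self.scale = scale

    def forward(self, x):
        return self.base(x) + self.scale * self.lora_up(self.lora_down(x))


layer = LoRALinear(320, 320, rank=4)
out = layer(paddle.randn([1, 77, 320]))
print(out.shape)  # [1, 77, 320]
```

Because only `lora_down` and `lora_up` receive gradients, the trainable state stays tiny, which is why the exported LoRA weights linked above are only a few MB.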


- -# 参考资料 -- https://github.com/huggingface/diffusers/tree/main/examples/text_to_image -- https://github.com/CompVis/stable-diffusion -- https://huggingface.co/lambdalabs/sd-pokemon-diffusers diff --git a/ppdiffusers/examples/text_to_image/requirements.txt b/ppdiffusers/examples/text_to_image/requirements.txt deleted file mode 100644 index d77a600a0daf..000000000000 --- a/ppdiffusers/examples/text_to_image/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -paddlenlp>=2.6.0rc0 -Pillow -ppdiffusers>=0.16.1 \ No newline at end of file diff --git a/ppdiffusers/examples/text_to_image/train_text_to_image.py b/ppdiffusers/examples/text_to_image/train_text_to_image.py deleted file mode 100644 index 4d0ca26486cb..000000000000 --- a/ppdiffusers/examples/text_to_image/train_text_to_image.py +++ /dev/null @@ -1,816 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import contextlib -import math -import os -import random -import sys -from pathlib import Path -from typing import Optional - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from datasets import DatasetDict, load_dataset -from huggingface_hub import HfFolder, Repository, create_repo, whoami -from paddle.distributed.fleet.utils.hybrid_parallel_util import ( - fused_allreduce_gradients, -) -from paddle.io import BatchSampler, DataLoader, DistributedBatchSampler -from paddle.optimizer import AdamW -from paddle.vision import BaseTransform, transforms -from tqdm.auto import tqdm - -from paddlenlp.trainer import set_seed -from paddlenlp.transformers import AutoTokenizer, PretrainedConfig -from paddlenlp.utils.downloader import get_path_from_url_with_filelock -from paddlenlp.utils.log import logger -from ppdiffusers import ( - AutoencoderKL, - DDPMScheduler, - DiffusionPipeline, - UNet2DConditionModel, - is_ppxformers_available, -) -from ppdiffusers.optimization import get_scheduler -from ppdiffusers.training_utils import ( - EMAModel, - freeze_params, - main_process_first, - unwrap_model, -) -from ppdiffusers.utils import PPDIFFUSERS_CACHE, check_min_version - -check_min_version("0.16.1") - - -def url_or_path_join(*path_list): - return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list) - - -class Lambda(BaseTransform): - def __init__(self, fn, keys=None): - super().__init__(keys) - self.fn = fn - - def _apply_image(self, img): - return self.fn(img) - - -def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str): - try: - text_encoder_config = PretrainedConfig.from_pretrained( - url_or_path_join(pretrained_model_name_or_path, "text_encoder") - ) - model_class = text_encoder_config.architectures[0] - except Exception: - model_class = "LDMBertModel" - if model_class == "CLIPTextModel": - from paddlenlp.transformers import CLIPTextModel - - return CLIPTextModel - elif model_class == 
"RobertaSeriesModelWithTransformation": - from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesModelWithTransformation, - ) - - return RobertaSeriesModelWithTransformation - elif model_class == "BertModel": - from paddlenlp.transformers import BertModel - - return BertModel - elif model_class == "LDMBertModel": - from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import ( - LDMBertModel, - ) - - return LDMBertModel - else: - raise ValueError(f"{model_class} is not supported.") - - -def set_recompute(model, value=False): - def fn(layer): - # ldmbert - if hasattr(layer, "enable_recompute"): - layer.enable_recompute = value - print("Set", layer.__class__, "recompute", layer.enable_recompute) - # unet - if hasattr(layer, "gradient_checkpointing"): - layer.gradient_checkpointing = value - print("Set", layer.__class__, "recompute", layer.gradient_checkpointing) - - model.apply(fn) - - -def get_report_to(args): - if args.report_to == "visualdl": - from visualdl import LogWriter - - writer = LogWriter(logdir=args.logging_dir) - elif args.report_to == "tensorboard": - from tensorboardX import SummaryWriter - - writer = SummaryWriter(logdir=args.logging_dir) - else: - raise ValueError("report_to must be in ['visualdl', 'tensorboard']") - return writer - - -def parse_args(input_args=None): - parser = argparse.ArgumentParser(description="Simple example of a training a text to image model script.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default=None, - required=True, - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--tokenizer_name", - type=str, - default=None, - help="Pretrained tokenizer name or path if not the same as model_name", - ) - parser.add_argument( - "--train_text_encoder", - action="store_true", - help="Whether to train the text encoder.", - ) - parser.add_argument( - "--dataset_name", - type=str, - default=None, - help=( - "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," - " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," - " or to a folder containing files that 🤗 Datasets can understand." - ), - ) - parser.add_argument( - "--dataset_config_name", - type=str, - default=None, - help="The config of the Dataset, leave as None if there's only one config.", - ) - parser.add_argument( - "--train_data_dir", - type=str, - default=None, - help=( - "A folder containing the training data. Folder contents must follow the structure described in" - " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" - " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." - ), - ) - parser.add_argument( - "--image_column", type=str, default="image", help="The column of the dataset containing an image." - ) - parser.add_argument( - "--caption_column", - type=str, - default="text", - help="The column of the dataset containing a caption or a list of captions.", - ) - parser.add_argument( - "--max_train_samples", - type=int, - default=None, - help=( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." 
- ), - ) - parser.add_argument( - "--output_dir", - type=str, - default="sd-model-finetuned", - help="The output directory where the model predictions and checkpoints will be written.", - ) - parser.add_argument( - "--cache_dir", - type=str, - default=None, - help="The directory where the downloaded models and datasets will be stored.", - ) - parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") - parser.add_argument( - "--height", - type=int, - default=None, - help=( - "The height for input images, all the images in the train/validation dataset will be resized to this" - " height" - ), - ) - parser.add_argument( - "--width", - type=int, - default=None, - help=( - "The width for input images, all the images in the train/validation dataset will be resized to this" - " width" - ), - ) - parser.add_argument( - "--resolution", - type=int, - default=512, - help=( - "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution" - ), - ) - parser.add_argument( - "--center_crop", - default=False, - action="store_true", - help=( - "Whether to center crop the input images to the resolution. If not set, the images will be randomly" - " cropped. The images will be resized to the resolution first before cropping." - ), - ) - parser.add_argument( - "--random_flip", - action="store_true", - help="whether to randomly flip images horizontally", - ) - parser.add_argument( - "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader." - ) - parser.add_argument("--num_train_epochs", type=int, default=100) - parser.add_argument( - "--max_train_steps", - type=int, - default=None, - help="Total number of training steps to perform. If provided, overrides num_train_epochs.", - ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument( - "--gradient_checkpointing", - action="store_true", - help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=1e-4, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument( - "--scale_lr", - action="store_true", - default=False, - help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", - ) - parser.add_argument( - "--lr_scheduler", - type=str, - default="constant", - help=( - 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]' - ), - ) - parser.add_argument( - "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." - ) - parser.add_argument( - "--snr_gamma", - type=float, - default=None, - help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. 
" - "More details here: https://arxiv.org/abs/2303.09556.", - ) - parser.add_argument( - "--lr_num_cycles", - type=int, - default=1, - help="Number of hard resets of the lr in cosine_with_restarts scheduler.", - ) - parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") - parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.") - parser.add_argument("--debug", action="store_true", help="Whether to debug this training script.") - parser.add_argument( - "--dataloader_num_workers", - type=int, - default=0, - help=( - "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." - ), - ) - parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") - parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") - parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") - parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") - parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") - parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") - parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") - parser.add_argument( - "--hub_model_id", - type=str, - default=None, - help="The name of the repository to keep in sync with the local `output_dir`.", - ) - parser.add_argument( - "--logging_dir", - type=str, - default="logs", - help=( - "[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. Will default to" - "*output_dir/logs" - ), - ) - parser.add_argument( - "--report_to", type=str, default="visualdl", choices=["tensorboard", "visualdl"], help="Log writer type." - ) - parser.add_argument( - "--checkpointing_steps", - type=int, - default=500, - help=("Save a checkpoint of the training state every X updates."), - ) - parser.add_argument( - "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." 
- ) - parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") - if input_args is not None: - args = parser.parse_args(input_args) - else: - args = parser.parse_args() - - # Sanity checks - if args.dataset_name is None and args.train_data_dir is None: - raise ValueError("Need either a dataset name or a training folder.") - args.logging_dir = os.path.join(args.output_dir, args.logging_dir) - if args.height is None or args.width is None and args.resolution is not None: - args.height = args.width = args.resolution - - return args - - -def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): - if token is None: - token = HfFolder.get_token() - if organization is None: - username = whoami(token)["name"] - return f"{username}/{model_id}" - else: - return f"{organization}/{model_id}" - - -DATASET_NAME_MAPPING = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), -} - - -def main(): - args = parse_args() - rank = paddle.distributed.get_rank() - is_main_process = rank == 0 - num_processes = paddle.distributed.get_world_size() - if num_processes > 1: - paddle.distributed.init_parallel_env() - - # If passed along, set the training seed now. - if args.seed is not None: - set_seed(args.seed) - - # Handle the repository creation - if is_main_process: - if args.output_dir is not None: - os.makedirs(args.output_dir, exist_ok=True) - if args.push_to_hub: - if args.hub_model_id is None: - repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) - else: - repo_name = args.hub_model_id - create_repo(repo_name, exist_ok=True, token=args.hub_token) - repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) - - with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: - if "step_*" not in gitignore: - gitignore.write("step_*\n") - if "epoch_*" not in gitignore: - gitignore.write("epoch_*\n") - - # Load the tokenizer - if args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) - elif args.pretrained_model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained(url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) - - # import correct text encoder class - text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path) - - # Load scheduler and models - noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") - text_encoder = text_encoder_cls.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "text_encoder") - ) - text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict() - if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]: - use_attention_mask = True - else: - use_attention_mask = False - vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae") - unet = UNet2DConditionModel.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="unet", - ) - - freeze_params(vae.parameters()) - if not args.train_text_encoder: - freeze_params(text_encoder.parameters()) - if args.use_ema: - ema_unet = UNet2DConditionModel.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="unet", - ) - ema_unet = EMAModel(ema_unet.parameters()) - - if args.gradient_checkpointing: - unet.enable_gradient_checkpointing() - if args.train_text_encoder: - set_recompute(text_encoder, True) - - if 
args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): - try: - unet.enable_xformers_memory_efficient_attention() - except Exception as e: - logger.warn( - "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}" - ) - - def compute_snr(timesteps): - """ - Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849 - """ - alphas_cumprod = noise_scheduler.alphas_cumprod - sqrt_alphas_cumprod = alphas_cumprod**0.5 - sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5 - - # Expand the tensors. - # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026 - sqrt_alphas_cumprod = sqrt_alphas_cumprod[timesteps].cast("float32") - while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape): - sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None] - alpha = sqrt_alphas_cumprod.expand(timesteps.shape) - - sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[timesteps].cast("float32") - while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape): - sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None] - sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape) - - # Compute SNR. - snr = (alpha / sigma) ** 2 - return snr - - # Get the datasets: you can either provide your own training and evaluation files (see below) - # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub). - - # In distributed training, the load_dataset function guarantees that only one local process can concurrently - # download the dataset. - if args.debug: - file_path = get_path_from_url_with_filelock( - "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/pokemon-blip-captions.tar.gz", - PPDIFFUSERS_CACHE, - ) - dataset = DatasetDict.load_from_disk(file_path) - args.dataset_name = "lambdalabs/pokemon-blip-captions" - else: - if args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - dataset = load_dataset( - args.dataset_name, - args.dataset_config_name, - cache_dir=args.cache_dir, - ) - else: - data_files = {} - if args.train_data_dir is not None: - data_files["train"] = os.path.join(args.train_data_dir, "**") - dataset = load_dataset( - "imagefolder", - data_files=data_files, - cache_dir=args.cache_dir, - ) - # See more about loading custom images at - # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder - - # Preprocessing the datasets. - # We need to tokenize inputs and targets. - column_names = dataset["train"].column_names - - # 6. Get the column names for input/target. 
- dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None) - if args.image_column is None: - image_column = dataset_columns[0] if dataset_columns is not None else column_names[0] - else: - image_column = args.image_column - if image_column not in column_names: - raise ValueError( - f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}" - ) - if args.caption_column is None: - caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1] - else: - caption_column = args.caption_column - if caption_column not in column_names: - raise ValueError( - f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}" - ) - - # Preprocessing the datasets. - # We need to tokenize input captions and transform the images. - def tokenize_captions(examples, is_train=True): - captions = [] - for caption in examples[caption_column]: - if isinstance(caption, str): - captions.append(caption) - elif isinstance(caption, (list, np.ndarray)): - # take a random caption if there are multiple - captions.append(random.choice(caption) if is_train else caption[0]) - else: - raise ValueError( - f"Caption column `{caption_column}` should contain either strings or lists of strings." - ) - inputs = tokenizer( - captions, - max_length=tokenizer.model_max_length, - padding="do_not_pad", - truncation=True, - return_attention_mask=False, - ) - return inputs.input_ids - - # Preprocessing the datasets. - train_transforms = transforms.Compose( - [ - transforms.Resize((args.height, args.width), interpolation="bilinear"), - transforms.CenterCrop((args.height, args.width)) - if args.center_crop - else transforms.RandomCrop((args.height, args.width)), - transforms.RandomHorizontalFlip() if args.random_flip else Lambda(lambda x: x), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ] - ) - - def preprocess_train(examples): - images = [image.convert("RGB") for image in examples[image_column]] - examples["pixel_values"] = [train_transforms(image) for image in images] - examples["input_ids"] = tokenize_captions(examples) - return examples - - with main_process_first(): - if args.max_train_samples is not None: - dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) - # Set the training transforms - train_dataset = dataset["train"].with_transform(preprocess_train) - - def collate_fn(examples): - pixel_values = paddle.stack([example["pixel_values"] for example in examples]).cast("float32") - input_ids = [example["input_ids"] for example in examples] - input_ids = tokenizer.pad( - {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pd" - ).input_ids - return { - "input_ids": input_ids, - "pixel_values": pixel_values, - } - - train_sampler = ( - DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) - if num_processes > 1 - else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) - ) - train_dataloader = DataLoader( - train_dataset, batch_sampler=train_sampler, collate_fn=collate_fn, num_workers=args.dataloader_num_workers - ) - - # Scheduler and math around the number of training steps. 
- num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - if args.max_train_steps is None: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - - if num_processes > 1: - unet = paddle.DataParallel(unet) - if args.train_text_encoder: - text_encoder = paddle.DataParallel(text_encoder) - - params_to_optimize = ( - list(unet.parameters()) + list(text_encoder.parameters()) if args.train_text_encoder else unet.parameters() - ) - - if args.scale_lr: - args.learning_rate = ( - args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes - ) - - lr_scheduler = get_scheduler( - args.lr_scheduler, - learning_rate=args.learning_rate, - num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, - num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, - num_cycles=args.lr_num_cycles, - power=args.lr_power, - ) - # Initialize the optimizer - optimizer = AdamW( - learning_rate=lr_scheduler, - parameters=params_to_optimize, - beta1=args.adam_beta1, - beta2=args.adam_beta2, - weight_decay=args.adam_weight_decay, - epsilon=args.adam_epsilon, - grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None, - ) - - if is_main_process: - logger.info("----------- Configuration Arguments -----------") - for arg, value in sorted(vars(args).items()): - logger.info("%s: %s" % (arg, value)) - logger.info("------------------------------------------------") - writer = get_report_to(args) - - # Train! - total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps - - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num batches each epoch = {len(train_dataloader)}") - logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") - logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {args.max_train_steps}") - - # Only show the progress bar once on each machine. 
- progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process) - progress_bar.set_description("Train Steps") - global_step = 0 - - # Keep vae in eval model as we don't train these - vae.eval() - if args.train_text_encoder: - text_encoder.train() - else: - text_encoder.eval() - unet.train() - - for epoch in range(args.num_train_epochs): - for step, batch in enumerate(train_dataloader): - # Convert images to latent space - latents = vae.encode(batch["pixel_values"]).latent_dist.sample() - latents = latents * vae.config.scaling_factor - - # Sample noise that we'll add to the latents - noise = paddle.randn(latents.shape, dtype=latents.dtype) - if args.noise_offset: - # https://www.crosslabs.org//blog/diffusion-with-offset-noise - noise += args.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype - ) - batch_size = latents.shape[0] - # Sample a random timestep for each image - timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,)).cast("int64") - - # Add noise to the latents according to the noise magnitude at each timestep - # (this is the forward diffusion process) - noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - - if num_processes > 1 and ( - args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0) - ): - # grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0: - # gradient_checkpointing, no_sync every where - # gradient_checkpointing + grad_acc, no_sync every where - unet_ctx_manager = unet.no_sync() - if args.train_text_encoder: - text_encoder_ctx_manager = text_encoder.no_sync() - else: - text_encoder_ctx_manager = ( - contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() - ) - else: - unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() - text_encoder_ctx_manager = ( - contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() - ) - - with text_encoder_ctx_manager: - # Get the text embedding for conditioning - if use_attention_mask: - attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64") - else: - attention_mask = None - encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0] - - with unet_ctx_manager: - # Predict the noise residual / sample - model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample - - # Get the target for loss depending on the prediction type - if noise_scheduler.config.prediction_type == "epsilon": - target = noise - elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, timesteps) - else: - raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") - - if args.snr_gamma is None: - loss = F.mse_loss(model_pred.cast("float32"), target.cast("float32"), reduction="mean") - else: - # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556. - # Since we predict the noise instead of x_0, the original formulation is slightly changed. - # This is discussed in Section 4.2 of the same paper. - snr = compute_snr(timesteps) - mse_loss_weights = ( - paddle.stack([snr, args.snr_gamma * paddle.ones_like(timesteps)], axis=1).min(1)[0] / snr - ) - # We first calculate the original loss. Then we mean over the non-batch dimensions and - # rebalance the sample-wise losses with their respective loss weights. 
- # Finally, we take the mean of the rebalanced loss. - loss = F.mse_loss(model_pred.cast("float32"), target.cast("float32"), reduction="none") - loss = loss.mean(axis=list(range(1, len(loss.shape)))) * mse_loss_weights - loss = loss.mean() - - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - loss.backward() - - if (step + 1) % args.gradient_accumulation_steps == 0: - if num_processes > 1 and args.gradient_checkpointing: - fused_allreduce_gradients(params_to_optimize, None) - optimizer.step() - lr_scheduler.step() - optimizer.clear_grad() - progress_bar.update(1) - global_step += 1 - step_loss = loss.item() * args.gradient_accumulation_steps - if args.use_ema: - ema_unet.step(unet.parameters()) - logs = { - "epoch": str(epoch).zfill(4), - "step_loss": round(step_loss, 10), - "lr": lr_scheduler.get_lr(), - } - progress_bar.set_postfix(**logs) - - if is_main_process: - for name, val in logs.items(): - if name == "epoch": - continue - writer.add_scalar(f"train/{name}", val, global_step) - - if global_step % args.checkpointing_steps == 0: - save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - unwrap_model(unet).save_pretrained(os.path.join(save_path, "unet")) - if args.train_text_encoder: - unwrap_model(text_encoder).save_pretrained(os.path.join(save_path, "text_encoder")) - - if global_step >= args.max_train_steps: - break - - # Create the pipeline using the trained modules and save it. - if is_main_process: - writer.close() - unet = unwrap_model(unet) - if args.use_ema: - ema_unet.copy_to(unet.parameters()) - pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - unet=unet, - text_encoder=unwrap_model(text_encoder), - ) - pipeline.save_pretrained(args.output_dir) - - if args.push_to_hub: - repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) - - -if __name__ == "__main__": - main() diff --git a/ppdiffusers/examples/text_to_image/train_text_to_image_lora.py b/ppdiffusers/examples/text_to_image/train_text_to_image_lora.py deleted file mode 100644 index d43fe57e28c1..000000000000 --- a/ppdiffusers/examples/text_to_image/train_text_to_image_lora.py +++ /dev/null @@ -1,921 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import contextlib -import gc -import math -import os -import random -import sys -from pathlib import Path -from typing import Optional - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from datasets import DatasetDict, load_dataset -from huggingface_hub import HfFolder, Repository, create_repo, whoami -from paddle.distributed.fleet.utils.hybrid_parallel_util import ( - fused_allreduce_gradients, -) -from paddle.io import BatchSampler, DataLoader, DistributedBatchSampler -from paddle.optimizer import AdamW -from paddle.vision import BaseTransform, transforms -from tqdm.auto import tqdm - -from paddlenlp.trainer import set_seed -from paddlenlp.transformers import AutoTokenizer, PretrainedConfig -from paddlenlp.utils.downloader import get_path_from_url_with_filelock -from paddlenlp.utils.log import logger -from ppdiffusers import ( - AutoencoderKL, - DDPMScheduler, - DiffusionPipeline, - DPMSolverMultistepScheduler, - UNet2DConditionModel, - is_ppxformers_available, -) -from ppdiffusers.loaders import AttnProcsLayers, LoraLoaderMixin -from ppdiffusers.models.attention_processor import ( - AttnProcessor, - AttnProcessor2_5, - LoRAAttnProcessor, - LoRAAttnProcessor2_5, -) -from ppdiffusers.optimization import get_scheduler -from ppdiffusers.training_utils import freeze_params, main_process_first, unwrap_model -from ppdiffusers.utils import ( - PPDIFFUSERS_CACHE, - TEXT_ENCODER_ATTN_MODULE, - check_min_version, -) - -check_min_version("0.16.1") - - -def url_or_path_join(*path_list): - return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list) - - -def save_model_card(repo_id: str, images=None, base_model=str, dataset_name=str, repo_folder=None): - img_str = "" - for i, image in enumerate(images): - image.save(os.path.join(repo_folder, f"image_{i}.png")) - img_str += f"![img_{i}](./image_{i}.png)\n" - - yaml = f""" ---- -license: creativeml-openrail-m -base_model: {base_model} -tags: -- stable-diffusion -- stable-diffusion-ppdiffusers -- text-to-image -- ppdiffusers -- lora -inference: false ---- - """ - model_card = f""" -# LoRA text2image fine-tuning - {repo_id} -These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. 
\n -{img_str} -""" - with open(os.path.join(repo_folder, "README.md"), "w") as f: - f.write(yaml + model_card) - - -def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str): - try: - text_encoder_config = PretrainedConfig.from_pretrained( - url_or_path_join(pretrained_model_name_or_path, "text_encoder") - ) - model_class = text_encoder_config.architectures[0] - except Exception: - model_class = "LDMBertModel" - if model_class == "CLIPTextModel": - from paddlenlp.transformers import CLIPTextModel - - return CLIPTextModel - elif model_class == "RobertaSeriesModelWithTransformation": - from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesModelWithTransformation, - ) - - return RobertaSeriesModelWithTransformation - elif model_class == "BertModel": - from paddlenlp.transformers import BertModel - - return BertModel - elif model_class == "LDMBertModel": - from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import ( - LDMBertModel, - ) - - return LDMBertModel - else: - raise ValueError(f"{model_class} is not supported.") - - -class Lambda(BaseTransform): - def __init__(self, fn, keys=None): - super().__init__(keys) - self.fn = fn - - def _apply_image(self, img): - return self.fn(img) - - -def get_report_to(args): - if args.report_to == "visualdl": - from visualdl import LogWriter - - writer = LogWriter(logdir=args.logging_dir) - elif args.report_to == "tensorboard": - from tensorboardX import SummaryWriter - - writer = SummaryWriter(logdir=args.logging_dir) - else: - raise ValueError("report_to must be in ['visualdl', 'tensorboard']") - return writer - - -def parse_args(input_args=None): - parser = argparse.ArgumentParser(description="Simple example of a training text to image lora script.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default=None, - required=True, - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--tokenizer_name", - type=str, - default=None, - help="Pretrained tokenizer name or path if not the same as model_name", - ) - parser.add_argument( - "--dataset_name", - type=str, - default=None, - help=( - "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," - " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," - " or to a folder containing files that 🤗 Datasets can understand." - ), - ) - parser.add_argument( - "--dataset_config_name", - type=str, - default=None, - help="The config of the Dataset, leave as None if there's only one config.", - ) - parser.add_argument( - "--train_data_dir", - type=str, - default=None, - help=( - "A folder containing the training data. Folder contents must follow the structure described in" - " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" - " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." - ), - ) - parser.add_argument( - "--image_column", type=str, default="image", help="The column of the dataset containing an image." - ) - parser.add_argument( - "--caption_column", - type=str, - default="text", - help="The column of the dataset containing a caption or a list of captions.", - ) - parser.add_argument( - "--validation_prompt", type=str, default=None, help="A prompt that is sampled during training for inference." 
- ) - parser.add_argument( - "--num_validation_images", - type=int, - default=4, - help="Number of images that should be generated during validation with `validation_prompt`.", - ) - parser.add_argument( - "--validation_epochs", - type=int, - default=50, - help=( - "Run fine-tuning validation every X epochs. The validation process consists of running the prompt" - " `args.validation_prompt` multiple times: `args.num_validation_images`." - ), - ) - parser.add_argument( - "--max_train_samples", - type=int, - default=None, - help=( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ), - ) - parser.add_argument( - "--output_dir", - type=str, - default="sd-model-finetuned-lora", - help="The output directory where the model predictions and checkpoints will be written.", - ) - parser.add_argument( - "--cache_dir", - type=str, - default=None, - help="The directory where the downloaded models and datasets will be stored.", - ) - parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") - parser.add_argument( - "--height", - type=int, - default=None, - help=( - "The height for input images, all the images in the train/validation dataset will be resized to this" - " height" - ), - ) - parser.add_argument( - "--width", - type=int, - default=None, - help=( - "The width for input images, all the images in the train/validation dataset will be resized to this" - " width" - ), - ) - parser.add_argument( - "--resolution", - type=int, - default=512, - help=( - "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution" - ), - ) - parser.add_argument( - "--center_crop", - default=False, - action="store_true", - help=( - "Whether to center crop the input images to the resolution. If not set, the images will be randomly" - " cropped. The images will be resized to the resolution first before cropping." - ), - ) - parser.add_argument( - "--lora_rank", - type=int, - default=4, - help="The rank of lora linear.", - ) - parser.add_argument( - "--random_flip", - action="store_true", - help="whether to randomly flip images horizontally", - ) - parser.add_argument( - "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader." - ) - parser.add_argument( - "--train_text_encoder", - action="store_true", - help="Whether to train the text encoder. If set, the text encoder should be float32 precision.", - ) - parser.add_argument("--num_train_epochs", type=int, default=100) - parser.add_argument( - "--max_train_steps", - type=int, - default=None, - help="Total number of training steps to perform. 
If provided, overrides num_train_epochs.", - ) - parser.add_argument( - "--checkpointing_steps", - type=int, - default=500, - help=("Save a checkpoint of the training state every X updates."), - ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument( - "--gradient_checkpointing", - action="store_true", - help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=1e-4, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument( - "--scale_lr", - action="store_true", - default=False, - help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", - ) - parser.add_argument( - "--lr_scheduler", - type=str, - default="constant", - help=( - 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]' - ), - ) - parser.add_argument( - "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." - ) - parser.add_argument( - "--lr_num_cycles", - type=int, - default=1, - help="Number of hard resets of the lr in cosine_with_restarts scheduler.", - ) - parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") - parser.add_argument("--debug", action="store_true", help="Whether to debug this training script.") - parser.add_argument( - "--dataloader_num_workers", - type=int, - default=0, - help=( - "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." - ), - ) - parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") - parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") - parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") - parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") - parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") - parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") - parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") - parser.add_argument( - "--hub_model_id", - type=str, - default=None, - help="The name of the repository to keep in sync with the local `output_dir`.", - ) - parser.add_argument( - "--logging_dir", - type=str, - default="logs", - help=( - "[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. Will default to" - "*output_dir/logs" - ), - ) - parser.add_argument( - "--report_to", type=str, default="visualdl", choices=["tensorboard", "visualdl"], help="Log writer type." - ) - parser.add_argument( - "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." 
- ) - parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") - if input_args is not None: - args = parser.parse_args(input_args) - else: - args = parser.parse_args() - - # Sanity checks - if args.dataset_name is None and args.train_data_dir is None: - raise ValueError("Need either a dataset name or a training folder.") - args.logging_dir = os.path.join(args.output_dir, args.logging_dir) - if args.height is None or args.width is None and args.resolution is not None: - args.height = args.width = args.resolution - - return args - - -def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): - if token is None: - token = HfFolder.get_token() - if organization is None: - username = whoami(token)["name"] - return f"{username}/{model_id}" - else: - return f"{organization}/{model_id}" - - -DATASET_NAME_MAPPING = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), -} - - -def main(): - args = parse_args() - rank = paddle.distributed.get_rank() - is_main_process = rank == 0 - num_processes = paddle.distributed.get_world_size() - if num_processes > 1: - paddle.distributed.init_parallel_env() - - # If passed along, set the training seed now. - if args.seed is not None: - set_seed(args.seed) - - # Handle the repository creation - if is_main_process: - if args.output_dir is not None: - os.makedirs(args.output_dir, exist_ok=True) - if args.push_to_hub: - if args.hub_model_id is None: - repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) - else: - repo_name = args.hub_model_id - create_repo(repo_name, exist_ok=True, token=args.hub_token) - repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) - - with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: - if "step_*" not in gitignore: - gitignore.write("step_*\n") - if "epoch_*" not in gitignore: - gitignore.write("epoch_*\n") - - # Load the tokenizer - if args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) - elif args.pretrained_model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained(url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) - - # import correct text encoder class - text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path) - - # Load scheduler and models - noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") - text_encoder = text_encoder_cls.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "text_encoder") - ) - text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict() - if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]: - use_attention_mask = True - else: - use_attention_mask = False - vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae") - unet = UNet2DConditionModel.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="unet", - ) - - # We only train the additional adapter LoRA layers - freeze_params(vae.parameters()) - freeze_params(text_encoder.parameters()) - freeze_params(unet.parameters()) - - if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): - try: - unet.enable_xformers_memory_efficient_attention() - except Exception as e: - logger.warning( - "Could not enable memory efficient attention. 
Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}" - ) - # now we will add new LoRA weights to the attention layers - # It's important to realize here how many attention weights will be added and of which sizes - # The sizes of the attention layers consist only of two different variables: - # 1) - the "hidden_size", which is increased according to `unet.config.block_out_channels`. - # 2) - the "cross attention size", which is set to `unet.config.cross_attention_dim`. - - # Let's first see how many attention processors we will have to set. - # For Stable Diffusion, it should be equal to: - # - down blocks (2x attention layers) * (2x transformer layers) * (3x down blocks) = 12 - # - mid blocks (2x attention layers) * (1x transformer layers) * (1x mid blocks) = 2 - # - up blocks (2x attention layers) * (3x transformer layers) * (3x down blocks) = 18 - # => 32 layers - - # Set correct lora layers - unet_lora_attn_procs = {} - for name, attn_processor in unet.attn_processors.items(): - cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = unet.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(unet.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = unet.config.block_out_channels[block_id] - - if isinstance(attn_processor, AttnProcessor): - lora_attn_processor_class = LoRAAttnProcessor - elif isinstance(attn_processor, AttnProcessor2_5): - lora_attn_processor_class = LoRAAttnProcessor2_5 - else: - raise ValueError(f"Unknown attention processor type: {attn_processor.__class__.__name__}") - - unet_lora_attn_procs[name] = lora_attn_processor_class( - hidden_size=hidden_size, - cross_attention_dim=cross_attention_dim, - rank=args.lora_rank, - ) - - unet.set_attn_processor(unet_lora_attn_procs) - unet_lora_layers = AttnProcsLayers(unet.attn_processors) - - # The text encoder comes from 🤗 transformers, so we cannot directly modify it. - # So, instead, we monkey-patch the forward calls of its attention-blocks. For this, - # we first load a dummy pipeline with the text encoder and then do the monkey-patching. - text_encoder_lora_layers = None - if args.train_text_encoder: - text_lora_attn_procs = {} - for name, module in text_encoder.named_sublayers(include_self=True): - if name.endswith(TEXT_ENCODER_ATTN_MODULE): - text_lora_attn_procs[name] = LoRAAttnProcessor( - hidden_size=module.out_proj.weight.shape[1], - cross_attention_dim=None, - rank=args.lora_rank, - ) - text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs) - temp_pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, text_encoder=text_encoder - ) - temp_pipeline._modify_text_encoder(text_lora_attn_procs) - text_encoder = temp_pipeline.text_encoder - del temp_pipeline - - # Get the datasets: you can either provide your own training and evaluation files (see below) - # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub). - - # In distributed training, the load_dataset function guarantees that only one local process can concurrently - # download the dataset. 
- if args.debug: - file_path = get_path_from_url_with_filelock( - "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/pokemon-blip-captions.tar.gz", - PPDIFFUSERS_CACHE, - ) - dataset = DatasetDict.load_from_disk(file_path) - args.dataset_name = "lambdalabs/pokemon-blip-captions" - else: - if args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - dataset = load_dataset( - args.dataset_name, - args.dataset_config_name, - cache_dir=args.cache_dir, - ) - else: - data_files = {} - if args.train_data_dir is not None: - data_files["train"] = os.path.join(args.train_data_dir, "**") - dataset = load_dataset( - "imagefolder", - data_files=data_files, - cache_dir=args.cache_dir, - ) - # See more about loading custom images at - # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder - - # Preprocessing the datasets. - # We need to tokenize inputs and targets. - column_names = dataset["train"].column_names - - # 6. Get the column names for input/target. - dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None) - if args.image_column is None: - image_column = dataset_columns[0] if dataset_columns is not None else column_names[0] - else: - image_column = args.image_column - if image_column not in column_names: - raise ValueError( - f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}" - ) - if args.caption_column is None: - caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1] - else: - caption_column = args.caption_column - if caption_column not in column_names: - raise ValueError( - f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}" - ) - - # Preprocessing the datasets. - # We need to tokenize input captions and transform the images. - def tokenize_captions(examples, is_train=True): - captions = [] - for caption in examples[caption_column]: - if isinstance(caption, str): - captions.append(caption) - elif isinstance(caption, (list, np.ndarray)): - # take a random caption if there are multiple - captions.append(random.choice(caption) if is_train else caption[0]) - else: - raise ValueError( - f"Caption column `{caption_column}` should contain either strings or lists of strings." - ) - inputs = tokenizer( - captions, - max_length=tokenizer.model_max_length, - padding="do_not_pad", - truncation=True, - return_attention_mask=False, - ) - return inputs.input_ids - - # Preprocessing the datasets. 
- train_transforms = transforms.Compose( - [ - transforms.Resize((args.height, args.width), interpolation="bilinear"), - transforms.CenterCrop((args.height, args.width)) - if args.center_crop - else transforms.RandomCrop((args.height, args.width)), - transforms.RandomHorizontalFlip() if args.random_flip else Lambda(lambda x: x), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ] - ) - - def preprocess_train(examples): - images = [image.convert("RGB") for image in examples[image_column]] - examples["pixel_values"] = [train_transforms(image) for image in images] - examples["input_ids"] = tokenize_captions(examples) - return examples - - with main_process_first(): - if args.max_train_samples is not None: - dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) - # Set the training transforms - train_dataset = dataset["train"].with_transform(preprocess_train) - - def collate_fn(examples): - pixel_values = paddle.stack([example["pixel_values"] for example in examples]).cast("float32") - input_ids = [example["input_ids"] for example in examples] - input_ids = tokenizer.pad( - {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pd" - ).input_ids - return { - "input_ids": input_ids, - "pixel_values": pixel_values, - } - - train_sampler = ( - DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) - if num_processes > 1 - else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) - ) - train_dataloader = DataLoader( - train_dataset, batch_sampler=train_sampler, collate_fn=collate_fn, num_workers=args.dataloader_num_workers - ) - - # Scheduler and math around the number of training steps. - num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - if args.max_train_steps is None: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - - if args.scale_lr: - args.learning_rate = ( - args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes - ) - - lr_scheduler = get_scheduler( - args.lr_scheduler, - learning_rate=args.learning_rate, - num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, - num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, - num_cycles=args.lr_num_cycles, - power=args.lr_power, - ) - - params_to_optimize = ( - list(unet_lora_layers.parameters()) + list(text_encoder_lora_layers.parameters()) - if args.train_text_encoder - else unet_lora_layers.parameters() - ) - # Optimizer creation - optimizer = AdamW( - learning_rate=lr_scheduler, - parameters=params_to_optimize, - beta1=args.adam_beta1, - beta2=args.adam_beta2, - weight_decay=args.adam_weight_decay, - epsilon=args.adam_epsilon, - grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None, - ) - - if num_processes > 1: - unet = paddle.DataParallel(unet) - if args.train_text_encoder: - text_encoder = paddle.DataParallel(text_encoder) - - if is_main_process: - logger.info("----------- Configuration Arguments -----------") - for arg, value in sorted(vars(args).items()): - logger.info("%s: %s" % (arg, value)) - logger.info("------------------------------------------------") - writer = get_report_to(args) - - # Train! 
- total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps - - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num batches each epoch = {len(train_dataloader)}") - logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") - logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {args.max_train_steps}") - - # Only show the progress bar once on each machine. - progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process) - progress_bar.set_description("Train Steps") - global_step = 0 - vae.eval() - if args.train_text_encoder: - text_encoder.train() - else: - text_encoder.eval() - - for epoch in range(args.num_train_epochs): - unet.train() - for step, batch in enumerate(train_dataloader): - # Convert images to latent space - latents = vae.encode(batch["pixel_values"]).latent_dist.sample() - latents = latents * vae.config.scaling_factor - - # Sample noise that we'll add to the latents - noise = paddle.randn(latents.shape, dtype=latents.dtype) - if args.noise_offset: - # https://www.crosslabs.org/blog/diffusion-with-offset-noise - noise += args.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype - ) - batch_size = latents.shape[0] - # Sample a random timestep for each image - timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,)).cast("int64") - - # Add noise to the latents according to the noise magnitude at each timestep - # (this is the forward diffusion process) - noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - - if num_processes > 1 and ( - args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0) - ): - # grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0: - # gradient_checkpointing, no_sync every where - # gradient_checkpointing + grad_acc, no_sync every where - unet_ctx_manager = unet.no_sync() - else: - unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() - - if use_attention_mask: - attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64") - else: - attention_mask = None - encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0] - - with unet_ctx_manager: - # Predict the noise residual / sample - model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample - - # Get the target for loss depending on the prediction type - if noise_scheduler.config.prediction_type == "epsilon": - target = noise - elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, timesteps) - else: - raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") - - loss = F.mse_loss(model_pred, target, reduction="mean") - - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - loss.backward() - - if (step + 1) % args.gradient_accumulation_steps == 0: - if num_processes > 1 and args.gradient_checkpointing: - fused_allreduce_gradients(params_to_optimize, None) - optimizer.step() - lr_scheduler.step() - optimizer.clear_grad() - progress_bar.update(1) - global_step += 1 - 
step_loss = loss.item() * args.gradient_accumulation_steps - logs = { - "epoch": str(epoch).zfill(4), - "step_loss": round(step_loss, 10), - "lr": lr_scheduler.get_lr(), - } - progress_bar.set_postfix(**logs) - - if is_main_process: - for name, val in logs.items(): - if name == "epoch": - continue - writer.add_scalar(f"train/{name}", val, global_step) - - if global_step % args.checkpointing_steps == 0: - save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - # We combine the text encoder and UNet LoRA parameters with a simple - # custom logic. So, use `LoraLoaderMixin.save_lora_weights()`. - LoraLoaderMixin.save_lora_weights( - save_directory=save_path, - unet_lora_layers=unet_lora_layers, - text_encoder_lora_layers=text_encoder_lora_layers, - ) - logger.info(f"Saved lora weights to {save_path}") - - if global_step >= args.max_train_steps: - break - - if is_main_process: - if args.validation_prompt is not None and epoch % args.validation_epochs == 0: - logger.info( - f"Running validation... \n Generating {args.num_validation_images} images with prompt:" - f" {args.validation_prompt}." - ) - # create pipeline - pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - unet=unwrap_model(unet), - text_encoder=unwrap_model(text_encoder), - safety_checker=None, - requires_safety_checker=False, - ) - pipeline.set_progress_bar_config(disable=True) - - # run inference - generator = paddle.Generator().manual_seed(args.seed) if args.seed else None - images = [ - pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0] - for _ in range(args.num_validation_images) - ] - np_images = np.stack([np.asarray(img) for img in images]) - - if args.report_to == "tensorboard": - writer.add_images("validation", np_images, epoch, dataformats="NHWC") - else: - writer.add_image("validation", np_images, epoch, dataformats="NHWC") - - del pipeline - gc.collect() - if args.train_text_encoder: - text_encoder.train() - unet.train() - - # Save the lora layers - if is_main_process: - LoraLoaderMixin.save_lora_weights( - save_directory=args.output_dir, - unet_lora_layers=unet_lora_layers, - text_encoder_lora_layers=text_encoder_lora_layers, - ) - - if args.push_to_hub: - save_model_card( - repo_name, - images=images, - base_model=args.pretrained_model_name_or_path, - prompt=args.instance_prompt, - repo_folder=args.output_dir, - ) - repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) - # Final inference - # Load previous pipeline - pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, safety_checker=None, requires_safety_checker=False - ) - pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) - # load attention processors - pipeline.load_lora_weights(args.output_dir) - - # run inference - if args.validation_prompt and args.num_validation_images > 0: - generator = paddle.Generator().manual_seed(args.seed) if args.seed else None - images = [ - pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0] - for _ in range(args.num_validation_images) - ] - np_images = np.stack([np.asarray(img) for img in images]) - - if args.report_to == "tensorboard": - writer.add_images("test", np_images, epoch, dataformats="NHWC") - else: - writer.add_image("test", np_images, epoch, dataformats="NHWC") - - writer.close() - - -if __name__ == "__main__": - main() diff --git a/ppdiffusers/examples/text_to_image_laion400m/README.md 
b/ppdiffusers/examples/text_to_image_laion400m/README.md deleted file mode 100644 index ea5285344de7..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/README.md +++ /dev/null @@ -1,331 +0,0 @@ -## Latent Diffusion Model 从零训练代码 - -本教程带领大家如何开启32层的**Latent Diffusion Model**的训练(支持切换`中文`和`英文`分词器)。 - -___注意___: -___官方32层`CompVis/ldm-text2im-large-256`的Latent Diffusion Model使用的是vae,而不是vqvae!而Huggingface团队在设计目录结构的时候把文件夹名字错误的设置成了vqvae!为了与Huggingface团队保持一致,我们同样使用了vqvae文件夹命名!___ - -## 1 本地运行 -### 1.1 安装依赖 - -在运行这个训练代码前,我们需要安装下面的训练依赖。 -```bash -# paddlepaddle-gpu>=2.4.1 -python -m pip install paddlepaddle-gpu==2.4.1.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html -pip install -r requirements.txt -``` - -### 1.2 准备数据 - -#### laion400m_en.filelist文件内部格式如下所示 -自己准备好处理后的数据,并且将文件放置于`/data/laion400m/`目录,其中里面的每个part的前三列为`caption文本描述, 占位符空, base64编码的图片`,`caption, _, img_b64 = vec[:3]`。 - -注意,当前`laion400m_en.filelist`只存放了10条数据路径,如果想要更多数据的话,请运行`python write_filelist.py`代码,运行后会生成6万条数据路径。 -``` -/data/laion400m/part-00000.gz -/data/laion400m/part-00001.gz -/data/laion400m/part-00002.gz -/data/laion400m/part-00003.gz -/data/laion400m/part-00004.gz -/data/laion400m/part-00005.gz -/data/laion400m/part-00006.gz -/data/laion400m/part-00007.gz -/data/laion400m/part-00008.gz -/data/laion400m/part-00009.gz -``` -#### train.filelist.list训练文件内部格式如下所示 -我们提供了`laion400m_en.filelist`,当然也可以存放其他`filelist` -``` -./data/filelist/laion400m_en.filelist -``` -Tips: 我们可以选择下载demo数据 -- 删除当前目录下的`data`; -- 下载demo数据`wget https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/laion400m_demo_data.tar.gz`; -- 解压demo数据`tar -zxvf laion400m_demo_data.tar.gz` - -### 1.3 使用trainner开启训练 -#### 1.3.1 硬件要求 -Tips: -- FP32 在 40GB 的显卡上可正常训练。 - -#### 1.3.2 单机单卡训练 -```bash -python -u train_txt2img_laion400m_trainer.py \ - --do_train \ - --output_dir ./laion400m_pretrain_output_trainer \ - --per_device_train_batch_size 16 \ - --gradient_accumulation_steps 2 \ - --learning_rate 5e-5 \ - --weight_decay 0.02 \ - --max_steps 1000000000 \ - --lr_scheduler_type "constant" \ - --warmup_steps 0 \ - --image_logging_steps 1000 \ - --logging_steps 50 \ - --save_steps 5000 \ - --save_total_limit 50 \ - --seed 23 \ - --dataloader_num_workers 6 \ - --vae_name_or_path CompVis/stable-diffusion-v1-4/vae \ - --text_encoder_config_file config/ldmbert.json \ - --unet_config_file config/unet.json \ - --file_list ./data/filelist/train.filelist.list \ - --num_inference_steps 200 \ - --model_max_length 77 \ - --tokenizer_name bert-base-uncased \ - --max_grad_norm -1 -``` - - -`train_txt2img_laion400m_trainer.py`代码可传入的参数解释如下: -> * `--vae_name_or_path`: 预训练`vae`模型名称或地址,`CompVis/stable-diffusion-v1-4/vae`为`kl-8.ckpt`,程序将自动从BOS上下载预训练好的权重。 -> * `--text_encoder_config_file`: `ldmbert`的config配置文件地址,默认为`./config/ldmbert.json`。 -> * `--unet_config_file`: `unet`的config配置文件地址,默认为`./config/unet.json`。 -> * `--pretrained_model_name_or_path`: 加载预训练模型的名称或本地路径,如`CompVis/ldm-text2im-large-256`,`pretrained_model_name_or_path`的优先级高于`vae_name_or_path`, `text_encoder_config_file`和`unet_config_file`。 -> * `--per_device_train_batch_size`: 训练时每张显卡所使用的`batch_size批量`,当我们的显存较小的时候,需要将这个值设置的小一点。 -> * `--gradient_accumulation_steps`: 梯度累积的步数,用户可以指定梯度累积的步数,在梯度累积的step中。减少多卡之间梯度的通信,减少更新的次数,扩大训练的batch_size。 -> * `--learning_rate`: 学习率。 -> * `--weight_decay`: AdamW优化器的`weight_decay`。 -> * `--max_steps`: 最大的训练步数。 -> * `--save_steps`: 每间隔多少步`(global step步数)`,保存模型。 -> * `--save_total_limit`: 最多保存多少个模型。 -> * `--lr_scheduler_type`: 要使用的学习率调度策略。默认为 `constant`。 -> * 
`--warmup_steps`: 用于从 0 到 `learning_rate` 的线性 warmup 的步数。 -> * `--image_logging_steps`: 每隔多少步,log训练过程中的图片,默认为`1000`步,注意`image_logging_steps`需要是`logging_steps`的整数倍。 -> * `--logging_steps`: logging日志的步数,默认为`50`步。 -> * `--output_dir`: 模型保存路径。 -> * `--seed`: 随机种子,为了可以复现训练结果,Tips:当前paddle设置该随机种子后仍无法完美复现。 -> * `--dataloader_num_workers`: Dataloader所使用的`num_workers`参数。 -> * `--file_list`: file_list文件地址。 -> * `--num_inference_steps`: 推理预测时候使用的步数。 -> * `--model_max_length`: `tokenizer`中的`model_max_length`参数,超过该长度将会被截断。 -> * `--tokenizer_name`: 我们需要使用的`tokenizer_name`,我们可以使用英文的分词器`bert-base-uncased`,也可以使用中文的分词器`ernie-1.0`。 -> * `--prediction_type`: 预测类型,可从`["epsilon", "v_prediction"]`选择。 -> * `--use_ema`: 是否对`unet`使用`ema`,默认为`False`。 -> * `--max_grad_norm`: 梯度剪裁的最大norm值,`-1`表示不使用梯度裁剪策略。 -> * `--recompute`: 是否开启重计算,(`bool`, 可选, 默认为 `False`),在开启后我们可以增大batch_size,注意在小batch_size的条件下,开启recompute后显存变化不明显,只有当开大batch_size后才能明显感受到区别。 -> * `--fp16`: 是否使用 fp16 混合精度训练而不是 fp32 训练。(`bool`, 可选, 默认为 `False`) -> * `--fp16_opt_level`: 混合精度训练模式,可为``O1``或``O2``模式,默认``O1``模式,默认O1. 只在fp16选项开启时候生效。 -> * `--enable_xformers_memory_efficient_attention`: 是否开启`xformers`,开启后训练速度会变慢,但是能够节省显存。注意我们需要安装develop版本的paddlepaddle! - - -#### 1.3.3 单机多卡训练 (多机多卡训练,仅需在 paddle.distributed.launch 后加个 --ips IP1,IP2,IP3,IP4) -```bash -python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" train_txt2img_laion400m_trainer.py \ - --do_train \ - --output_dir ./laion400m_pretrain_output_trainer \ - --per_device_train_batch_size 16 \ - --gradient_accumulation_steps 2 \ - --learning_rate 5e-5 \ - --weight_decay 0.02 \ - --max_steps 1000000000 \ - --lr_scheduler_type "constant" \ - --warmup_steps 0 \ - --image_logging_steps 1000 \ - --logging_steps 50 \ - --save_steps 5000 \ - --save_total_limit 50 \ - --seed 23 \ - --dataloader_num_workers 6 \ - --vae_name_or_path CompVis/stable-diffusion-v1-4/vae \ - --text_encoder_config_file config/ldmbert.json \ - --unet_config_file config/unet.json \ - --file_list ./data/filelist/train.filelist.list \ - --num_inference_steps 200 \ - --model_max_length 77 \ - --tokenizer_name bert-base-uncased \ - --max_grad_norm -1 -``` - -### 1.4 自定义训练逻辑开启训练 -#### 1.4.1 单机单卡训练 -```bash -python -u train_txt2img_laion400m_no_trainer.py \ - --output_dir ./laion400m_pretrain_output_no_trainer \ - --per_device_train_batch_size 16 \ - --gradient_accumulation_steps 2 \ - --learning_rate 5e-5 \ - --weight_decay 0.02 \ - --max_steps 1000000000 \ - --lr_scheduler_type "constant" \ - --warmup_steps 0 \ - --image_logging_steps 1000 \ - --logging_steps 50 \ - --save_steps 5000 \ - --seed 23 \ - --dataloader_num_workers 6 \ - --vae_name_or_path CompVis/stable-diffusion-v1-4/vae \ - --text_encoder_config_file config/ldmbert.json \ - --unet_config_file config/unet.json \ - --file_list ./data/filelist/train.filelist.list \ - --num_inference_steps 200 \ - --model_max_length 77 \ - --tokenizer_name bert-base-uncased \ - --max_grad_norm -1 -``` - -`train_txt2img_laion400m_no_trainer.py`代码可传入的参数解释如下: -> 主要修改的参数 -> * `--vae_name_or_path`: 预训练`vae`模型名称或地址,`CompVis/stable-diffusion-v1-4/vae`为`kl-8.ckpt`,程序将自动从BOS上下载预训练好的权重。 -> * `--text_encoder_config_file`: `ldmbert`的config配置文件地址,默认为`./config/ldmbert.json`。 -> * `--unet_config_file`: `unet`的config配置文件地址,默认为`./config/unet.json`。 -> * `--pretrained_model_name_or_path`: 加载预训练模型的名称或本地路径,如`CompVis/ldm-text2im-large-256`,`pretrained_model_name_or_path`的优先级高于`vae_name_or_path`, `text_encoder_config_file`和`unet_config_file`。 -> * `--per_device_train_batch_size`: 
训练时每张显卡所使用的`batch_size批量`,当我们的显存较小的时候,需要将这个值设置的小一点。 -> * `--gradient_accumulation_steps`: 梯度累积的步数,用户可以指定梯度累积的步数,在梯度累积的step中。减少多卡之间梯度的通信,减少更新的次数,扩大训练的batch_size。 -> * `--learning_rate`: 学习率。 -> * `--weight_decay`: AdamW优化器的`weight_decay`。 -> * `--max_steps`: 最大的训练步数。 -> * `--save_steps`: 每间隔多少步`(global step步数)`,保存模型。 -> * `--lr_scheduler_type`: 要使用的学习率调度策略。默认为 `constant`。 -> * `--warmup_steps`: 用于从 0 到 `learning_rate` 的线性 warmup 的步数。 -> * `--image_logging_steps`: 每隔多少步,log训练过程中的图片,默认为`1000`步,注意`image_logging_steps`需要是`logging_steps`的整数倍。 -> * `--logging_steps`: logging日志的步数,默认为`50`步。 -> * `--output_dir`: 模型保存路径。 -> * `--seed`: 随机种子,为了可以复现训练结果,Tips:当前paddle设置该随机种子后仍无法完美复现。 -> * `--dataloader_num_workers`: Dataloader所使用的`num_workers`参数。 -> * `--file_list`: file_list文件地址。 -> * `--num_inference_steps`: 推理预测时候使用的步数。 -> * `--model_max_length`: `tokenizer`中的`model_max_length`参数,超过该长度将会被截断。 -> * `--tokenizer_name`: 我们需要使用的`tokenizer_name`。 -> * `--use_ema`: 是否对`unet`使用`ema`,默认为`False`。 -> * `--max_grad_norm`: 梯度剪裁的最大norm值,`-1`表示不使用梯度裁剪策略。 -> * `--recompute`: 是否开启重计算,(`bool`, 可选, 默认为 `False`),在开启后我们可以增大batch_size,注意在小batch_size的条件下,开启recompute后显存变化不明显,只有当开大batch_size后才能明显感受到区别。 -> * `--fp16`: 是否使用 fp16 混合精度训练而不是 fp32 训练。(`bool`, 可选, 默认为 `False`) -> * `--fp16_opt_level`: 混合精度训练模式,可为``O1``或``O2``模式,默认``O1``模式,默认O1. 只在fp16选项开启时候生效。 -> * `--enable_xformers_memory_efficient_attention`: 是否开启`xformers`,开启后训练速度会变慢,但是能够节省显存。注意我们需要安装develop版本的paddlepaddle! - -#### 1.4.2 单机多卡训练 -```bash -python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" train_txt2img_laion400m_no_trainer.py \ - --output_dir ./laion400m_pretrain_output_no_trainer \ - --per_device_train_batch_size 16 \ - --gradient_accumulation_steps 2 \ - --learning_rate 5e-5 \ - --weight_decay 0.02 \ - --max_steps 1000000000 \ - --lr_scheduler_type "constant" \ - --warmup_steps 0 \ - --image_logging_steps 1000 \ - --logging_steps 50 \ - --save_steps 5000 \ - --seed 23 \ - --dataloader_num_workers 6 \ - --vae_name_or_path CompVis/stable-diffusion-v1-4/vae \ - --text_encoder_config_file config/ldmbert.json \ - --unet_config_file config/unet.json \ - --file_list ./data/filelist/train.filelist.list \ - --num_inference_steps 200 \ - --model_max_length 77 \ - --tokenizer_name bert-base-uncased \ - --max_grad_norm -1 -``` - -## 2 模型推理 - -待模型训练完毕,会在`output_dir`保存训练好的模型权重,我们可以使用`generate_pipelines.py`生成推理所使用的`Pipeline`。 -```bash -python generate_pipelines.py \ - --model_file ./laion400m_pretrain_output_no_trainer/model_state.pdparams \ - --output_path ./ldm_pipelines \ - --vae_name_or_path CompVis/stable-diffusion-v1-4/vae \ - --text_encoder_config_file ./config/ldmbert.json \ - --unet_config_file ./config/unet.json \ - --tokenizer_name_or_path bert-base-uncased \ - --model_max_length 77 -``` -`generate_pipelines.py`代码可传入的参数解释如下: -> * `--model_file`: 我们使用`train_txt2img_laion400m_trainer.py`代码,训练好所得到的`model_state.pdparams`文件。 -> * `--output_path`: 生成的pipeline所要保存的路径。 -> * `--vae_name_or_path`: 使用的`vae`的名字或者本地路径,注意我们需要里面的`config.json`文件。 -> * `--text_encoder_config_file`: 文本编码器的`config`配置文件。 -> * `--unet_config_file`: `unet`的`config`配置文件。 -> * `--tokenizer_name_or_path`: 所使用的`tokenizer`名称或者本地路径,名称可以是`bert-base-uncased`, `bert-base-chinese`, `ernie-1.0`等。 -> * `--model_max_length`: `tokenizer`中的`model_max_length`参数,超过该长度将会被截断。 - - -输出的模型目录结构如下: -```shell -├── ldm_pipelines # 我们指定的输出文件路径 - ├── model_index.json # 模型index文件 - ├── vqvae # vae权重文件夹!实际是vae模型,文件夹名字与HF保持了一致! 
- ├── model_state.pdparams - ├── config.json - ├── bert # ldmbert权重文件夹 - ├── model_config.json - ├── model_state.pdparams - ├── unet # unet权重文件夹 - ├── model_state.pdparams - ├── config.json - ├── scheduler # ddim scheduler文件夹 - ├── scheduler_config.json - ├── tokenizer # bert tokenizer文件夹 - ├── tokenizer_config.json - ├── special_tokens_map.json - ├── vocab.txt -``` - -在生成`Pipeline`的权重后,我们可以使用如下的代码进行推理。 - -```python -from ppdiffusers import LDMTextToImagePipeline -model_name_or_path = "./ldm_pipelines" -pipe = LDMTextToImagePipeline.from_pretrained(model_name_or_path) -prompt = "a photo of an astronaut riding a horse on mars" -image = pipe(prompt, guidance_scale=7.5).images[0] -image.save("astronaut_rides_horse.png") -``` - -当然,我们也可以使用训练好的模型在`coco en 1k`数据集上生成图片。 -首先我们需要下载`mscoco.en.1k`文件。 -```bash -wget https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/mscoco.en.1k -``` -然后可以`generate_images.py`文件生成对应的图片。 -```bash -python generate_images.py \ - --model_name_or_path ./ldm_pipelines \ - --file ./mscoco.en.1k \ - --batch_size 16 \ - --save_path ./outputs \ - --guidance_scales 3 4 5 6 7 8 \ - --seed 42 \ - --scheduler_type ddim \ - --height 256 \ - --width 256 \ - --num_inference_steps 50 \ - --device gpu -``` -`generate_images.py`代码可传入的参数解释如下: -> * `--model_name_or_path`: 我们需要评估的模型名称或地址,这里我们使用上一步生成的`ldm_pipelines`。 -> * `--file`: 需要评估的文件,我们可以从[这里](https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/mscoco.en.1k)下载。 -> * `--batch_size`: 生成图片所使用的batch_size。 -> * `--save_path`: 生成的图片所要保存的路径。 -> * `--guidance_scales`: guidance_scales值,我们可以输入3 4 5 6 7 8。 -> * `--seed`: 为了保证不同guidance_scales值,能够使用相同的`latents`初始值, `-1`表示不使用随机种子。 -> * `--scheduler_type`: 采样器的类型,支持`ddim`, `pndm`, `euler-ancest` 和 `lms`。 -> * `--num_inference_steps`: 推理预测时候使用的步数。 -> * `--height`: 生成图片的高度。 -> * `--width`: 生成图片的宽度。 -> * `--device`: 使用的设备,可以是`gpu`, `cpu`, `gpu:0`, `gpu:1`等。 - - -输出的图片目录如下: -```shell -├── outputs # 我们指定的输出文件路径 - ├── mscoco.en_g3 # guidance_scales为3的输出图片 - ├── 00000_000.png - ├── 00001_000.png - ...... - ├── 00999_000.png - ├── mscoco.en_g4 # guidance_scales为4的输出图片 - ├── 00000_000.png - ├── 00001_000.png - ...... - ├── 00999_000.png - ...... - ├── mscoco.en_g8 # guidance_scales为8的输出图片 - ├── 00000_000.png - ├── 00001_000.png - ...... 
- ├── 00999_000.png -``` diff --git a/ppdiffusers/examples/text_to_image_laion400m/config/ldmbert.json b/ppdiffusers/examples/text_to_image_laion400m/config/ldmbert.json deleted file mode 100644 index 63ba020549d2..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/config/ldmbert.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "vocab_size": 30522, - "max_position_embeddings": 77, - "encoder_layers": 32, - "encoder_ffn_dim": 5120, - "encoder_attention_heads": 8, - "head_dim": 64, - "activation_function": "gelu", - "d_model": 1280, - "dropout": 0.0, - "attention_dropout": 0.0, - "activation_dropout": 0.0, - "init_std": 0.02, - "pad_token_id": 0 -} \ No newline at end of file diff --git a/ppdiffusers/examples/text_to_image_laion400m/config/unet.json b/ppdiffusers/examples/text_to_image_laion400m/config/unet.json deleted file mode 100644 index ec1ead47915d..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/config/unet.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "act_fn": "silu", - "attention_head_dim": 8, - "block_out_channels": [ - 320, - 640, - 1280, - 1280 - ], - "center_input_sample": false, - "cross_attention_dim": 1280, - "down_block_types": [ - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "DownBlock2D" - ], - "downsample_padding": 1, - "flip_sin_to_cos": true, - "freq_shift": 0, - "in_channels": 4, - "layers_per_block": 2, - "mid_block_scale_factor": 1, - "norm_eps": 1e-05, - "norm_num_groups": 32, - "out_channels": 4, - "sample_size": 32, - "up_block_types": [ - "UpBlock2D", - "CrossAttnUpBlock2D", - "CrossAttnUpBlock2D", - "CrossAttnUpBlock2D" - ] - } - \ No newline at end of file diff --git a/ppdiffusers/examples/text_to_image_laion400m/data/filelist/laion400m_en.filelist b/ppdiffusers/examples/text_to_image_laion400m/data/filelist/laion400m_en.filelist deleted file mode 100644 index a70eccdedbad..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/data/filelist/laion400m_en.filelist +++ /dev/null @@ -1,10 +0,0 @@ -/data/laion400m/part-00000.gz -/data/laion400m/part-00001.gz -/data/laion400m/part-00002.gz -/data/laion400m/part-00003.gz -/data/laion400m/part-00004.gz -/data/laion400m/part-00005.gz -/data/laion400m/part-00006.gz -/data/laion400m/part-00007.gz -/data/laion400m/part-00008.gz -/data/laion400m/part-00009.gz \ No newline at end of file diff --git a/ppdiffusers/examples/text_to_image_laion400m/data/filelist/laion_aes.filelist b/ppdiffusers/examples/text_to_image_laion400m/data/filelist/laion_aes.filelist deleted file mode 100644 index 86b0e5191d63..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/data/filelist/laion_aes.filelist +++ /dev/null @@ -1,50 +0,0 @@ -/root/laion_aes/part-00000 -/root/laion_aes/part-00001 -/root/laion_aes/part-00002 -/root/laion_aes/part-00003 -/root/laion_aes/part-00004 -/root/laion_aes/part-00005 -/root/laion_aes/part-00006 -/root/laion_aes/part-00007 -/root/laion_aes/part-00008 -/root/laion_aes/part-00009 -/root/laion_aes/part-00010 -/root/laion_aes/part-00011 -/root/laion_aes/part-00012 -/root/laion_aes/part-00013 -/root/laion_aes/part-00014 -/root/laion_aes/part-00015 -/root/laion_aes/part-00016 -/root/laion_aes/part-00017 -/root/laion_aes/part-00018 -/root/laion_aes/part-00019 -/root/laion_aes/part-00020 -/root/laion_aes/part-00021 -/root/laion_aes/part-00022 -/root/laion_aes/part-00023 -/root/laion_aes/part-00024 -/root/laion_aes/part-00025 -/root/laion_aes/part-00026 -/root/laion_aes/part-00027 -/root/laion_aes/part-00028 -/root/laion_aes/part-00029 
-/root/laion_aes/part-00030 -/root/laion_aes/part-00031 -/root/laion_aes/part-00032 -/root/laion_aes/part-00033 -/root/laion_aes/part-00034 -/root/laion_aes/part-00035 -/root/laion_aes/part-00036 -/root/laion_aes/part-00037 -/root/laion_aes/part-00038 -/root/laion_aes/part-00039 -/root/laion_aes/part-00040 -/root/laion_aes/part-00041 -/root/laion_aes/part-00042 -/root/laion_aes/part-00043 -/root/laion_aes/part-00044 -/root/laion_aes/part-00045 -/root/laion_aes/part-00046 -/root/laion_aes/part-00047 -/root/laion_aes/part-00048 -/root/laion_aes/part-00049 \ No newline at end of file diff --git a/ppdiffusers/examples/text_to_image_laion400m/data/filelist/laion_aes.filelist.list b/ppdiffusers/examples/text_to_image_laion400m/data/filelist/laion_aes.filelist.list deleted file mode 100644 index 0e36e494e2a3..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/data/filelist/laion_aes.filelist.list +++ /dev/null @@ -1 +0,0 @@ -./data/filelist/laion_aes.filelist diff --git a/ppdiffusers/examples/text_to_image_laion400m/data/filelist/train.filelist.list b/ppdiffusers/examples/text_to_image_laion400m/data/filelist/train.filelist.list deleted file mode 100644 index 4bc020729904..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/data/filelist/train.filelist.list +++ /dev/null @@ -1 +0,0 @@ -./data/filelist/laion400m_en.filelist diff --git a/ppdiffusers/examples/text_to_image_laion400m/data/filelist/write_filelist.py b/ppdiffusers/examples/text_to_image_laion400m/data/filelist/write_filelist.py deleted file mode 100644 index 358bca25f4fd..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/data/filelist/write_filelist.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -data = [] -for index in range(60000): - data.append("/data/laion400m/part-{:05}.gz\n".format(index)) - -with open("laion400m_en.filelist", "w") as w: - w.writelines(data) diff --git a/ppdiffusers/examples/text_to_image_laion400m/generate_images.py b/ppdiffusers/examples/text_to_image_laion400m/generate_images.py deleted file mode 100644 index cf9bbc1327cc..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/generate_images.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
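# generate_images.py renders evaluation images from a trained LDM pipeline: it loads an
# LDMTextToImagePipeline, swaps in the requested scheduler (ddim, pndm, lms or an
# euler-ancestral variant), reads English captions from a tab-separated prompt file and
# saves one image per caption for every value passed via --guidance_scales.
#
# Example invocation, taken from the README above (paths assume the exported
# ./ldm_pipelines pipeline and the downloaded mscoco.en.1k prompt file):
#   python generate_images.py \
#       --model_name_or_path ./ldm_pipelines \
#       --file ./mscoco.en.1k \
#       --batch_size 16 \
#       --save_path ./outputs \
#       --guidance_scales 3 4 5 6 7 8 \
#       --seed 42 \
#       --scheduler_type ddim \
#       --height 256 --width 256 \
#       --num_inference_steps 50 \
#       --device gpu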
- -import argparse -import os -import random - -import paddle -import pandas as pd -from tqdm.auto import tqdm - -from ppdiffusers import ( - DDIMScheduler, - EulerAncestralDiscreteScheduler, - LDMTextToImagePipeline, - LMSDiscreteScheduler, - PNDMScheduler, -) - - -def batchify(data, batch_size=16): - one_batch = [] - for example in data: - one_batch.append(example) - if len(one_batch) == batch_size: - yield one_batch - one_batch = [] - if one_batch: - yield one_batch - - -def generate_images( - model_name_or_path, - batch_size=16, - file="coco30k.csv", - save_path="output", - seed=42, - scheduler_type="ddim", - eta=0.0, - num_inference_steps=50, - guidance_scales=[3, 4, 5, 6, 7, 8], - height=256, - width=256, - device="gpu", -): - paddle.set_device(device) - pipe = LDMTextToImagePipeline.from_pretrained(model_name_or_path) - pipe.set_progress_bar_config(disable=True) - beta_start = pipe.scheduler.beta_start - beta_end = pipe.scheduler.beta_end - if scheduler_type == "pndm": - scheduler = PNDMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - set_alpha_to_one=False, - steps_offset=1, - # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, - ) - elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") - elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler( - beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear" - ) - elif scheduler_type == "ddim": - scheduler = DDIMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - # Make sure the scheduler compatible with DDIM - clip_sample=False, - set_alpha_to_one=False, - steps_offset=1, - ) - else: - raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - pipe.scheduler = scheduler - # read file - df = pd.read_csv(file, sep="\t") - all_prompt = df["caption_en"].tolist() - for cfg in guidance_scales: - new_save_path = os.path.join(save_path, f"mscoco.en_g{cfg}") - os.makedirs(new_save_path, exist_ok=True) - if seed is not None and seed > 0: - random.seed(seed) - i = 0 - for batch_prompt in tqdm(batchify(all_prompt, batch_size=batch_size)): - sd = random.randint(0, 2**32) - paddle.seed(sd) - images = pipe( - batch_prompt, - guidance_scale=float(cfg), - eta=eta, - height=height, - width=width, - num_inference_steps=num_inference_steps, - )[0] - for image in images: - path = os.path.join(new_save_path, "{:05d}_000.png".format(i)) - image.save(path) - i += 1 - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, help="model_name_or_path.") - parser.add_argument( - "--file", - default="./coco30k.tsv", - type=str, - help="eval file.", - ) - parser.add_argument( - "--seed", - default=42, - type=int, - help="random seed.", - ) - parser.add_argument( - "--scheduler_type", - default="ddim", - type=str, - choices=["ddim", "lms", "pndm", "euler-ancest"], - help="Type of scheduler to use. 
Should be one of ['pndm', 'lms', 'ddim', 'euler-ancest']", - ) - parser.add_argument("--device", default="gpu", type=str, help="device") - parser.add_argument("--batch_size", default=16, type=int, help="batch_size") - parser.add_argument("--num_inference_steps", default=50, type=int, help="num_inference_steps") - parser.add_argument("--save_path", default="output/1.5b_ldm/12w.pd", type=str, help="Path to the output file.") - parser.add_argument( - "--guidance_scales", default=[3, 4, 5, 6, 7, 8], nargs="+", type=str, help="guidance_scales list." - ) - parser.add_argument("--height", default=256, type=int, help="height.") - parser.add_argument("--width", default=256, type=int, help="width.") - args = parser.parse_args() - print("----------- Configuration Arguments -----------") - for arg, value in sorted(vars(args).items()): - print("%s: %s" % (arg, value)) - print("------------------------------------------------") - generate_images( - model_name_or_path=args.model_name_or_path, - batch_size=args.batch_size, - file=args.file, - save_path=args.save_path, - seed=args.seed, - guidance_scales=args.guidance_scales, - num_inference_steps=args.num_inference_steps, - scheduler_type=args.scheduler_type, - height=args.height, - width=args.width, - device=args.device, - ) diff --git a/ppdiffusers/examples/text_to_image_laion400m/generate_pipelines.py b/ppdiffusers/examples/text_to_image_laion400m/generate_pipelines.py deleted file mode 100644 index 6fa3717e0d4b..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/generate_pipelines.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -import paddle - -from paddlenlp.transformers import AutoTokenizer -from paddlenlp.utils.log import logger -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - LDMBertModel, - LDMTextToImagePipeline, - UNet2DConditionModel, -) -from ppdiffusers.pipelines.latent_diffusion import LDMBertConfig - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_file", type=str, default="./model_state.pdparams", help="path to pretrained model_state.pdparams" - ) - parser.add_argument("--output_path", type=str, default="./ldm_pipelines", help="the output path of pipeline.") - parser.add_argument( - "--vae_name_or_path", - type=str, - default="CompVis/stable-diffusion-v1-4/vae", - help="pretrained_vae_name_or_path.", - ) - parser.add_argument( - "--text_encoder_config_file", type=str, default="./config/ldmbert.json", help="text_encoder_config_file." 
- ) - parser.add_argument("--unet_config_file", type=str, default="./config/unet.json", help="unet_config_file.") - parser.add_argument( - "--tokenizer_name_or_path", - type=str, - default="bert-base-uncased", - help="Pretrained tokenizer name or path if not the same as model_name.", - ) - parser.add_argument("--model_max_length", type=int, default=77, help="Pretrained tokenizer model_max_length.") - parser.add_argument("--device", type=str, default=None, help="Device to use. Like gpu:0 or cpu") - - return parser.parse_args() - - -def extract_paramaters(model_file="model_state.pdparams", dtype="float32"): - state_dict = paddle.load(model_file) - unet = {} - vae = {} - bert = {} - for k, v in state_dict.items(): - unet_key = "unet." - if k.startswith(unet_key): - unet[k.replace(unet_key, "")] = v.astype(dtype) - - vae_key = "vae." - vqvae_key = "vqvae." - if k.startswith(vae_key): - vae[k.replace(vae_key, "")] = v.astype(dtype) - elif k.startswith(vqvae_key): - vae[k.replace(vqvae_key, "")] = v.astype(dtype) - - bert_key = "text_encoder." - if k.startswith(bert_key): - bert[k.replace(bert_key, "")] = v.astype(dtype) - - return unet, vae, bert - - -def read_json(file): - with open(file, "r", encoding="utf-8") as f: - data = json.load(f) - return data - - -def check_keys(model, state_dict): - cls_name = model.__class__.__name__ - missing_keys = [] - mismatched_keys = [] - for k, v in model.state_dict().items(): - if k not in state_dict.keys(): - missing_keys.append(k) - if list(v.shape) != list(state_dict[k].shape): - mismatched_keys.append(k) - if len(missing_keys): - missing_keys_str = ", ".join(missing_keys) - print(f"{cls_name} Found missing_keys {missing_keys_str}!") - if len(mismatched_keys): - mismatched_keys_str = ", ".join(mismatched_keys) - print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") - - -def build_pipelines( - model_file, - output_path, - vae_name_or_path, - unet_config_file, - text_encoder_config_file, - tokenizer_name_or_path="bert-base-uncased", - model_max_length=77, -): - vae = AutoencoderKL.from_config(vae_name_or_path) - unet = UNet2DConditionModel(**read_json(unet_config_file)) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, model_max_length=model_max_length) - text_encoder_config = read_json(text_encoder_config_file) - vocab_size = text_encoder_config["vocab_size"] - max_position_embeddings = text_encoder_config["max_position_embeddings"] - if tokenizer.vocab_size != vocab_size: - logger.info( - f"The tokenizer has a vocab size of {tokenizer.vocab_size}, while the text encoder has a vocab size of {vocab_size}, we will use {tokenizer.vocab_size} as vocab_size!" - ) - text_encoder_config["vocab_size"] = tokenizer.vocab_size - - if tokenizer.model_max_length != max_position_embeddings: - logger.info( - f"The tokenizer's model_max_length {tokenizer.model_max_length}, while the text encoder's max_position_embeddings is {max_position_embeddings}, we will use {tokenizer.model_max_length} as max_position_embeddings!" 
- ) - text_encoder_config["max_position_embeddings"] = tokenizer.model_max_length - cofnig = LDMBertConfig(**text_encoder_config) - text_encoder = LDMBertModel(cofnig) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - # Make sure the scheduler compatible with DDIM - clip_sample=False, - set_alpha_to_one=False, - steps_offset=1, - ) - unet_dict, vae_dict, text_encoder_dict = extract_paramaters(model_file) - check_keys(unet, unet_dict) - check_keys(vae, vae_dict) - check_keys(text_encoder, text_encoder_dict) - unet.load_dict(unet_dict) - vae.load_dict(vae_dict) - text_encoder.load_dict(text_encoder_dict) - pipe = LDMTextToImagePipeline(bert=text_encoder, tokenizer=tokenizer, scheduler=scheduler, vqvae=vae, unet=unet) - pipe.save_pretrained(output_path) - - -if __name__ == "__main__": - args = parse_args() - if args.device is not None: - paddle.set_device(args.device) - build_pipelines( - model_file=args.model_file, - output_path=args.output_path, - vae_name_or_path=args.vae_name_or_path, - unet_config_file=args.unet_config_file, - text_encoder_config_file=args.text_encoder_config_file, - tokenizer_name_or_path=args.tokenizer_name_or_path, - model_max_length=args.model_max_length, - ) diff --git a/ppdiffusers/examples/text_to_image_laion400m/ldm/__init__.py b/ppdiffusers/examples/text_to_image_laion400m/ldm/__init__.py deleted file mode 100644 index 279af727312e..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/ldm/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# flake8: noqa - -from .ldm_args import DataArguments, ModelArguments, NoTrainerTrainingArguments -from .ldm_trainer import LatentDiffusionTrainer -from .model import LatentDiffusionModel -from .text_image_pair_dataset import TextImagePair, worker_init_fn diff --git a/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_args.py b/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_args.py deleted file mode 100644 index 38d531f7231c..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_args.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
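# ldm_args.py groups the configuration dataclasses used by the LAION-400M
# text-to-image training scripts:
#   * ModelArguments             - pretrained vae path, ldmbert/unet config files,
#                                  tokenizer, EMA / xformers / prediction-type options;
#   * DataArguments              - filelist location, target resolution and the
#                                  streaming buffer / shuffle settings;
#   * NoTrainerTrainingArguments - optimizer, LR schedule, logging and checkpointing
#                                  options for the no-trainer entry point.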
- -import types -from dataclasses import asdict, dataclass, field -from typing import Optional - -import paddle - -from paddlenlp.utils.log import logger - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. - """ - - # use pretrained vae kl-8.ckpt (CompVis/stable-diffusion-v1-4/vae) - vae_name_or_path: Optional[str] = field( - default="CompVis/stable-diffusion-v1-4/vae", metadata={"help": "pretrained_vae_name_or_path"} - ) - text_encoder_config_file: Optional[str] = field( - default="./config/ldmbert.json", metadata={"help": "text_encoder_config_file"} - ) - unet_config_file: Optional[str] = field(default="./config/unet.json", metadata={"help": "unet_config_file"}) - tokenizer_name: Optional[str] = field( - default="bert-base-uncased", - metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}, - ) - model_max_length: Optional[int] = field(default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) - num_inference_steps: Optional[int] = field(default=200, metadata={"help": "num_inference_steps"}) - use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"}) - pretrained_model_name_or_path: str = field( - default=None, metadata={"help": "Path to pretrained model or model, when we want to resume training."} - ) - image_logging_steps: Optional[int] = field(default=1000, metadata={"help": "Log image every X steps."}) - enable_xformers_memory_efficient_attention: bool = field( - default=False, metadata={"help": "enable_xformers_memory_efficient_attention."} - ) - to_static: bool = field(default=False, metadata={"help": "Whether or not to_static"}) - prediction_type: Optional[str] = field( - default="epsilon", - metadata={ - "help": "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)" - }, - ) - benchmark: bool = field( - default=False, - metadata={"help": "Whether or not run benchmark."}, - ) - profiler_options: Optional[str] = field( - default=None, - metadata={"help": "profiler_options."}, - ) - noise_offset: Optional[int] = field(default=0, metadata={"help": "The scale of noise offset."}) - - -@dataclass -class DataArguments: - """ - Arguments pertaining to what data we are going to input our model for training. - """ - - file_list: str = field( - default="./data/filelist/train.filelist.list", metadata={"help": "The name of the file_list."} - ) - resolution: int = field( - default=256, - metadata={ - "help": "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." 
- }, - ) - num_records: int = field(default=10000000, metadata={"help": "num_records"}) - buffer_size: int = field( - default=100, - metadata={"help": "Buffer size"}, - ) - shuffle_every_n_samples: int = field( - default=5, - metadata={"help": "shuffle_every_n_samples."}, - ) - - -@dataclass -class NoTrainerTrainingArguments: - output_dir: str = field( - default="outputs", - metadata={"help": "The output directory where the model predictions and checkpoints will be written."}, - ) - per_device_train_batch_size: int = field( - default=16, metadata={"help": "Batch size per GPU core/CPU for training."} - ) - - gradient_accumulation_steps: int = field( - default=2, - metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."}, - ) - learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."}) - weight_decay: float = field(default=0.02, metadata={"help": "Weight decay for AdamW if we apply some."}) - adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"}) - adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"}) - adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."}) - max_grad_norm: float = field(default=-1.0, metadata={"help": "Max gradient norm."}) - num_train_epochs: int = field(default=100, metadata={"help": "Total number of training epochs to perform."}) - max_steps: int = field( - default=1000000000, - metadata={"help": "If > 0: set total number of training steps to perform. Override num_train_epochs."}, - ) - lr_scheduler_type: str = field( - default="constant", - metadata={ - "help": 'The scheduler type to use. support ["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"]' - }, - ) - warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."}) - - logging_dir: Optional[str] = field(default="logs", metadata={"help": "VisualDL log dir."}) - - logging_steps: int = field(default=50, metadata={"help": "Log every X updates steps."}) - - save_steps: int = field(default=5000, metadata={"help": "Save checkpoint every X updates steps."}) - - seed: int = field(default=23, metadata={"help": "Random seed that will be set at the beginning of training."}) - dataloader_num_workers: int = field( - default=6, - metadata={ - "help": "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." - }, - ) - report_to: str = field( - default="visualdl", metadata={"help": "The list of integrations to report the results and logs to."} - ) - recompute: bool = field( - default=False, - metadata={ - "help": "Recompute the forward pass to calculate gradients. Used for saving memory. " - "Only support for networks with transformer blocks." - }, - ) - - def __str__(self): - self_as_dict = asdict(self) - self_as_dict = {k: f"<{k.upper()}>" if k.endswith("_token") else v for k, v in self_as_dict.items()} - - attrs_as_str = [f"{k}={v},\n" for k, v in sorted(self_as_dict.items())] - return f"{self.__class__.__name__}(\n{''.join(attrs_as_str)})" - - __repr__ = __str__ - - def print_config(self, args=None, key=""): - """ - print all config values. 
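Attributes are discovered via dir(); dunder names and bound methods are skipped,
and each key/value pair is written through paddlenlp's logger.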
- """ - logger.info("=" * 60) - if args is None: - args = self - key = "Training" - - logger.info("{:^40}".format("{} Configuration Arguments".format(key))) - logger.info("{:30}:{}".format("paddle commit id", paddle.version.commit)) - - for a in dir(args): - if a[:2] != "__": # don't print double underscore methods - v = getattr(args, a) - if not isinstance(v, types.MethodType): - logger.info("{:30}:{}".format(a, v)) - - logger.info("") diff --git a/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_trainer.py b/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_trainer.py deleted file mode 100644 index f4c3cf5a845c..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_trainer.py +++ /dev/null @@ -1,220 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import contextlib -import sys -import time - -import paddle.amp.auto_cast as autocast -from paddle.io import DataLoader - -from paddlenlp.trainer import PrinterCallback, ProgressCallback, Trainer -from paddlenlp.trainer.integrations import ( - INTEGRATION_TO_CALLBACK, - TrainerCallback, - VisualDLCallback, - rewrite_logs, -) -from paddlenlp.utils import profiler -from paddlenlp.utils.log import logger - -from .text_image_pair_dataset import TextImagePair, worker_init_fn - - -class VisualDLWithImageCallback(VisualDLCallback): - def autocast_smart_context_manager(self, args): - if args.fp16 or args.bf16: - amp_dtype = "float16" if args.fp16 else "bfloat16" - ctx_manager = autocast( - True, - custom_black_list=[ - "reduce_sum", - "c_softmax_with_cross_entropy", - ], - level=args.fp16_opt_level, - dtype=amp_dtype, - ) - else: - ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() - - return ctx_manager - - def on_step_end(self, args, state, control, model=None, **kwargs): - if hasattr(model, "on_train_batch_end"): - model.on_train_batch_end() - if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0: - control.should_log = True - - def on_log(self, args, state, control, logs=None, **kwargs): - # log image on each node - inputs = kwargs.get("inputs", None) - model = kwargs.get("model", None) - image_logs = {} - if ( - inputs is not None - and model is not None - and args.image_logging_steps > 0 - and state.global_step % args.image_logging_steps == 0 - ): - with self.autocast_smart_context_manager(args): - image_logs["reconstruction"] = model.decode_image(pixel_values=inputs["pixel_values"]) - image_logs["ddim-samples-1.0"] = model.log_image( - input_ids=inputs["input_ids"], guidance_scale=1.0, height=args.resolution, width=args.resolution - ) - image_logs["ddim-samples-7.5"] = model.log_image( - input_ids=inputs["input_ids"], guidance_scale=7.5, height=args.resolution, width=args.resolution - ) - - if not state.is_world_process_zero: - return - - if self.vdl_writer is None: - self._init_summary_writer(args) - - if self.vdl_writer is not None: - logs = rewrite_logs(logs) - for k, v in 
logs.items(): - if isinstance(v, (int, float)): - self.vdl_writer.add_scalar(k, v, state.global_step) - else: - logger.warning( - "Trainer is attempting to log a value of " - f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' - "This invocation of VisualDL's writer.add_scalar() " - "is incorrect so we dropped this attribute." - ) - # log images - for k, v in image_logs.items(): - self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC") - self.vdl_writer.flush() - - -class AverageStatistical(object): - def __init__(self): - self.reset() - - def reset(self): - self.total_cnt = 0 - self.time = 0 - - def record(self, val, cnt=1): - self.time += val - self.total_cnt += cnt - - def get_average(self): - if self.total_cnt == 0: - return 0 - - return self.time / self.total_cnt - - def get_average_per_sec(self): - if self.time == 0.0: - return 0.0 - - return float(self.total_cnt) / self.time - - def get_total_cnt(self): - return self.total_cnt - - def get_total_time(self): - return self.time - - -class BenchmarkCallback(TrainerCallback): - def __init__(self, benchmark=True, profiler_options=None): - self.benchmark = benchmark - self.profiler_options = profiler_options - - def on_train_begin(self, args, state, control, **kwargs): - assert args.gradient_accumulation_steps == 1 and not args.do_eval and not args.do_predict - if self.benchmark: - self.reader_cost_avg = AverageStatistical() - - def on_epoch_begin(self, args, state, control, **kwargs): - if self.benchmark: - self.epoch_start = time.time() - self.batch_start = time.time() - - def on_step_begin(self, args, state, control, **kwargs): - if self.benchmark: - self.reader_cost_avg.record(time.time() - self.batch_start) - - def on_step_end(self, args, state, control, **kwargs): - if self.profiler_options is not None: - profiler.add_profiler_step(self.profiler_options) - - if self.benchmark: - self.batch_start = time.time() - if control.should_log: - self.maybe_log_save_evaluate_start = time.time() - - def on_log(self, args, state, control, logs=None, **kwargs): - if self.benchmark: - if logs is not None and "interval_steps_per_second" in logs: - self.batch_start = self.batch_start + (time.time() - self.maybe_log_save_evaluate_start) - ips = logs["interval_steps_per_second"] * args.train_batch_size - avg_batch_cost = 1 / logs["interval_steps_per_second"] - logger.info( - "global step %d / %d, loss: %f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sample/sec" - % ( - state.global_step, - state.max_steps, - logs["loss"], - self.reader_cost_avg.get_average(), - avg_batch_cost, - args.train_batch_size, - ips, - ) - ) - self.reader_cost_avg.reset() - - def on_epoch_end(self, args, state, control, **kwargs): - if self.benchmark: - train_epoch_cost = time.time() - self.epoch_start - logger.info("train epoch: %d, epoch_cost: %.5f s" % (state.epoch, train_epoch_cost)) - - -# register visualdl_with_image -INTEGRATION_TO_CALLBACK.update({"custom_visualdl": VisualDLWithImageCallback}) - - -class LatentDiffusionTrainer(Trainer): - def __init__(self, **kwargs): - super().__init__(**kwargs) - if self.args.benchmark or self.args.profiler_options is not None: - self.add_callback( - BenchmarkCallback(benchmark=self.args.benchmark, profiler_options=self.args.profiler_options) - ) - if self.args.benchmark: - if self.args.disable_tqdm: - self.pop_callback(PrinterCallback) - else: - self.pop_callback(ProgressCallback) - - def compute_loss(self, model, inputs, return_outputs=False): - loss = model(**inputs) - return 
loss - - def get_train_dataloader(self): - if self.train_dataset is None: - raise ValueError("Trainer: training requires a train_dataset.") - if isinstance(self.train_dataset, TextImagePair): - return DataLoader( - self.train_dataset, - batch_size=self.args.train_batch_size, - num_workers=self.args.dataloader_num_workers, - worker_init_fn=worker_init_fn, - ) - else: - return super().get_train_dataloader() diff --git a/ppdiffusers/examples/text_to_image_laion400m/ldm/model.py b/ppdiffusers/examples/text_to_image_laion400m/ldm/model.py deleted file mode 100644 index 7d0e7e35d3df..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/ldm/model.py +++ /dev/null @@ -1,340 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import contextlib -import inspect -import os - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from paddlenlp.transformers import AutoTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - DDPMScheduler, - LDMBertModel, - UNet2DConditionModel, - is_ppxformers_available, -) -from ppdiffusers.models.attention import AttentionBlock -from ppdiffusers.models.ema import LitEma -from ppdiffusers.pipelines.latent_diffusion import LDMBertConfig -from ppdiffusers.training_utils import freeze_params - -try: - from ppdiffusers.models.attention import SpatialTransformer -except ImportError: - from ppdiffusers.models.transformer_2d import Transformer2DModel as SpatialTransformer - -import json - -from paddlenlp.utils.log import logger -from ppdiffusers.initializer import normal_, reset_initialized_parameter, zeros_ -from ppdiffusers.models.resnet import ResnetBlock2D - - -def read_json(file): - with open(file, "r", encoding="utf-8") as f: - data = json.load(f) - return data - - -class LatentDiffusionModel(nn.Layer): - def __init__(self, model_args): - super().__init__() - # init tokenizer - tokenizer_name_or_path = ( - model_args.tokenizer_name - if model_args.pretrained_model_name_or_path is None - else os.path.join(model_args.pretrained_model_name_or_path, "tokenizer") - ) - self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name_or_path, model_max_length=model_args.model_max_length - ) - - # init vae - vae_name_or_path = ( - model_args.vae_name_or_path - if model_args.pretrained_model_name_or_path is None - else os.path.join(model_args.pretrained_model_name_or_path, "vqvae") - ) - self.vae = AutoencoderKL.from_pretrained(vae_name_or_path) - freeze_params(self.vae.parameters()) - logger.info("Freeze vae parameters!") - - if model_args.pretrained_model_name_or_path is None: - assert ( - model_args.text_encoder_config_file is not None and model_args.unet_config_file is not None - ), "we must supply text_encoder_config_file & unet_config_file" - # init text_encoder - text_encoder_config = read_json(model_args.text_encoder_config_file) - vocab_size = text_encoder_config["vocab_size"] - max_position_embeddings = text_encoder_config["max_position_embeddings"] - if 
self.tokenizer.vocab_size != vocab_size: - logger.info( - f"The tokenizer has a vocab size of {self.tokenizer.vocab_size}, while the text encoder has a vocab size of {vocab_size}, we will use {self.tokenizer.vocab_size} as vocab_size!" - ) - text_encoder_config["vocab_size"] = self.tokenizer.vocab_size - - if self.tokenizer.model_max_length != max_position_embeddings: - logger.info( - f"The tokenizer's model_max_length {self.tokenizer.model_max_length}, while the text encoder's max_position_embeddings is {max_position_embeddings}, we will use {self.tokenizer.model_max_length} as max_position_embeddings!" - ) - text_encoder_config["max_position_embeddings"] = self.tokenizer.model_max_length - config = LDMBertConfig(**text_encoder_config) - self.text_encoder = LDMBertModel(config) - self.text_encoder_is_pretrained = False - # init unet2d - self.unet = UNet2DConditionModel(**read_json(model_args.unet_config_file)) - self.unet_is_pretrained = False - else: - # init text_encoder - self.text_encoder = LDMBertModel.from_pretrained( - model_args.pretrained_model_name_or_path, subfolder="bert" - ) - - self.text_encoder_is_pretrained = True - # init unet2d - self.unet = UNet2DConditionModel.from_pretrained( - model_args.pretrained_model_name_or_path, subfolder="unet" - ) - self.unet_is_pretrained = True - - assert model_args.prediction_type in ["epsilon", "v_prediction"] - self.prediction_type = model_args.prediction_type - self.noise_scheduler = DDPMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - num_train_timesteps=1000, - prediction_type=self.prediction_type, - ) - self.register_buffer("alphas_cumprod", self.noise_scheduler.alphas_cumprod) - - if model_args.image_logging_steps > 0: - self.eval_scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - num_train_timesteps=1000, - clip_sample=False, - set_alpha_to_one=False, - steps_offset=1, - prediction_type=self.prediction_type, - ) - self.eval_scheduler.set_timesteps(model_args.num_inference_steps) - self.init_weights() - self.use_ema = model_args.use_ema - self.noise_offset = model_args.noise_offset - if self.use_ema: - self.model_ema = LitEma(self.unet) - - if model_args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): - try: - self.unet.enable_xformers_memory_efficient_attention() - except Exception as e: - logger.warn( - "Could not enable memory efficient attention. 
Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}" - ) - - # make sure unet text_encoder in train mode, vae in eval mode - self.unet.train() - self.text_encoder.train() - self.vae.eval() - - def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, - ) -> paddle.Tensor: - sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 - sqrt_alpha_prod = sqrt_alpha_prod.flatten() - while len(sqrt_alpha_prod.shape) < len(original_samples.shape): - sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - - sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5 - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - - noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise - return noisy_samples - - def get_velocity(self, sample: paddle.Tensor, noise: paddle.Tensor, timesteps: paddle.Tensor) -> paddle.Tensor: - sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 - sqrt_alpha_prod = sqrt_alpha_prod.flatten() - while len(sqrt_alpha_prod.shape) < len(sample.shape): - sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - - sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5 - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - - velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample - return velocity - - def init_weights(self): - # init text_encoder - if not self.text_encoder_is_pretrained: - reset_initialized_parameter(self.text_encoder) - normal_(self.text_encoder.embeddings.word_embeddings.weight, 0, 0.02) - normal_(self.text_encoder.embeddings.position_embeddings.weight, 0, 0.02) - # init unet - if not self.unet_is_pretrained: - reset_initialized_parameter(self.unet) - zeros_(self.unet.conv_out.weight) - zeros_(self.unet.conv_out.bias) - for _, m in self.unet.named_sublayers(): - if isinstance(m, AttentionBlock): - zeros_(m.proj_attn.weight) - zeros_(m.proj_attn.bias) - if isinstance(m, ResnetBlock2D): - zeros_(m.conv2.weight) - zeros_(m.conv2.bias) - if isinstance(m, SpatialTransformer): - zeros_(m.proj_out.weight) - zeros_(m.proj_out.bias) - - @contextlib.contextmanager - def ema_scope(self, context=None): - if self.use_ema: - self.model_ema.store(self.unet.parameters()) - self.model_ema.copy_to(self.unet) - if context is not None: - print(f"{context}: Switched to EMA weights") - try: - yield None - finally: - if self.use_ema: - self.model_ema.restore(self.unet.parameters()) - if context is not None: - print(f"{context}: Restored training weights") - - def on_train_batch_end(self): - if self.use_ema: - self.model_ema(self.unet) - - def forward(self, input_ids=None, pixel_values=None, **kwargs): - with paddle.no_grad(): - # TODO add this - # with paddle.amp.auto_cast(enable=False): - self.vae.eval() - latents = self.vae.encode(pixel_values).latent_dist.sample() - latents = latents * 0.18215 - noise = paddle.randn(latents.shape) - if self.noise_offset: - # https://www.crosslabs.org//blog/diffusion-with-offset-noise - noise += self.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), dtype=noise.dtype - ) - timesteps = paddle.randint(0, self.noise_scheduler.num_train_timesteps, (latents.shape[0],)).astype( - 
"int64" - ) - noisy_latents = self.add_noise(latents, noise, timesteps) - - encoder_hidden_states = self.text_encoder(input_ids)[0] - noise_pred = self.unet(noisy_latents, timesteps, encoder_hidden_states).sample - - # Get the target for loss depending on the prediction type - if self.prediction_type == "epsilon": - target = noise - elif self.prediction_type == "v_prediction": - target = self.get_velocity(latents, noise, timesteps) - else: - raise ValueError(f"Unknown prediction type {self.prediction_type}") - - loss = F.mse_loss(noise_pred.cast("float32"), target.cast("float32"), reduction="none").mean([1, 2, 3]).mean() - - return loss - - @paddle.no_grad() - def decode_image(self, pixel_values=None, **kwargs): - self.eval() - if pixel_values.shape[0] > 8: - pixel_values = pixel_values[:8] - latents = self.vae.encode(pixel_values).latent_dist.sample() - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1).transpose([0, 2, 3, 1]) - image = (image * 255.0).cast("float32").numpy().round() - return image - - @paddle.no_grad() - def log_image(self, input_ids=None, height=256, width=256, eta=0.0, guidance_scale=7.5, **kwargs): - self.eval() - with self.ema_scope(): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - # only log 8 image - if input_ids.shape[0] > 8: - input_ids = input_ids[:8] - - text_embeddings = self.text_encoder(input_ids)[0] - do_classifier_free_guidance = guidance_scale > 1.0 - if do_classifier_free_guidance: - batch_size, max_length = input_ids.shape - uncond_input = self.tokenizer( - [""] * batch_size, - padding="max_length", - truncation=True, - max_length=max_length, - return_tensors="pd", - ) - uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] - text_embeddings = paddle.concat([uncond_embeddings, text_embeddings], axis=0) - - latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, height // 8, width // 8)) - # ddim donot use this - latents = latents * self.eval_scheduler.init_noise_sigma - - accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - for t in self.eval_scheduler.timesteps: - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - # ddim donot use this - latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - # compute the previous noisy sample x_t -> x_t-1 - latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - latents = 1 / 0.18215 * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1).transpose([0, 2, 3, 1]) * 255.0 - return image.cast("float32").numpy().round() - - def set_recompute(self, value=False): - def fn(layer): - # ldmbert - if hasattr(layer, "enable_recompute"): - layer.enable_recompute = value - print("Set", layer.__class__, "recompute", layer.enable_recompute) - # unet - if hasattr(layer, "gradient_checkpointing"): - layer.gradient_checkpointing = value - 
print("Set", layer.__class__, "recompute", layer.gradient_checkpointing) - - self.apply(fn) diff --git a/ppdiffusers/examples/text_to_image_laion400m/ldm/text_image_pair_dataset.py b/ppdiffusers/examples/text_to_image_laion400m/ldm/text_image_pair_dataset.py deleted file mode 100644 index b41f0b799469..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/ldm/text_image_pair_dataset.py +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import base64 -import gzip -import io -import json -import random - -import numpy as np -import paddle -import paddle.distributed as dist -from paddle.io import IterableDataset, get_worker_info -from paddle.vision import transforms -from paddle.vision.transforms.transforms import _get_image_size -from PIL import Image - -Image.MAX_IMAGE_PIXELS = 2300000000 - - -def parse_line(line, filename): - def parse_src(filename): - if "laion_aes" in filename: - return "laion_aes" - elif "laion400m" in filename: - return "laion400m" - else: - raise NotImplementedError(f"Unkown data source, {filename}") - - try: - vec = line.strip().split("\t") - data_source = parse_src(filename) - if data_source == "laion400m": - caption, _, img_b64 = vec[:3] - elif data_source == "laion_aes": - text_json = json.loads(vec[2]) - img_b64 = vec[5] - caption = text_json.get("caption_en", text_json.get("blip_caption_en", "")) - else: - _, captions, _, _, _, img_b64 = vec[:6] - caption = random.sample(captions.split("|"), 1)[0].replace("\1", "") - - image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB") - if random.random() < 0.1: - caption = "" - return dict(image=image, caption=caption) - except Exception: - print(f"error when parse file {filename}") - # traceback.print_exc() - return None - - -# donot use random.randint -class RandomCrop(transforms.RandomCrop): - def _get_param(self, img, output_size): - w, h = _get_image_size(img) - th, tw = output_size - if w == tw and h == th: - return 0, 0, h, w - - i = paddle.randint(0, h - th + 1).item() - j = paddle.randint(0, w - tw + 1).item() - return i, j, th, tw - - -class TextImagePair(IterableDataset): - def __init__( - self, - file_list, - size, - num_records, - image_processing=None, - buffer_size=1000, - shuffle_every_n_samples=5, - interpolation="lanczos", - tokenizer=None, - ): - self.size = size - if image_processing is None: - self.image_processing = transforms.Compose( - [ - transforms.Resize(int(size / 0.9), interpolation), - RandomCrop(size), - transforms.ToTensor(), - transforms.Normalize(0.5, 0.5), - ] - ) - else: - self.image_processing = image_processing - self.text_processing = lambda caption: tokenizer( - caption, - padding="max_length", - truncation=True, - max_length=tokenizer.model_max_length, - return_tensors="pd", - ).input_ids[0] - self.file_list = [] - file_weights = [] - with open(file_list, "r") as f: - file_lists = f.read().strip().split("\n") - for file_l in file_lists: - file_l = file_l.split(" 
") - if len(file_l) > 1: - file_weight = float(file_l[1]) - file_weights.append(file_weight) - file_l = file_l[0] - with open(file_l, "r") as f: - self.file_list.append(f.read().strip().split("\n")) - print([len(file_l) for file_l in self.file_list]) - if len(file_weights) == len(self.file_list): - file_weights = np.array(file_weights) - file_weight_sum = np.sum(file_weights) - assert file_weight_sum > 0, "sum of file weights must > 0" - file_weights = file_weights / file_weight_sum - print(f"sample weights of files: {file_weights}") - self.file_weights_cumsum = np.cumsum(file_weights) - self.file_weights_cumsum = np.concatenate([[0.0], self.file_weights_cumsum]) - else: - print("sample each file list with same probabiliy") - self.file_weights_cumsum = None - - self.num_records = num_records - self.file_ids = [np.arange(len(filelist)) for filelist in self.file_list] - print(f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}") - self.buffer_size = buffer_size - self.shuffle_every_n_samples = shuffle_every_n_samples - - def sample_loader(self, file_ids, filenames): - while True: - random.shuffle(file_ids) - for i in file_ids: - filename = filenames[i].strip("\n") - with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f: - # retry = 0 - while True: - line = f.readline() - - if line == b"": - break - try: - try: - line = line.decode(encoding="utf-8") - except Exception: - line = line.decode(encoding="gb18030") - except Exception: - print(f"error on file {filename}") - continue - data = parse_line(line, filename) - if data is None: - # retry += 1 - # if retry > 100: - # break - continue - else: - w, h = data["image"].size - if w < self.size or h < self.size: - continue - yield { - "pixel_values": self.image_processing(data["image"]), - "input_ids": self.text_processing(data["caption"]), - } - - def random_load_from_multi_dataset(self): - print(f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}") - sample_loader_per_dataset = [ - iter(self.sample_loader(self.file_ids[i], self.file_list[i])) for i in range(len(self.file_ids)) - ] - - while True: - if self.file_weights_cumsum is None: - sample_loader = random.choice(sample_loader_per_dataset) - else: - rand_num = random.random() - for i in range(len(self.file_list)): - if self.file_weights_cumsum[i] <= rand_num < self.file_weights_cumsum[i + 1]: - break - sample_loader = sample_loader_per_dataset[i] - # debug - # print(self.file_list[i][0]) - yield next(sample_loader) - - def shuffle(self, iterator): - buffer_list = [] - for _ in range(self.buffer_size): - buffer_list.append(next(iterator)) - i = 0 - while True: - if i % self.shuffle_every_n_samples == 0: - random.shuffle(buffer_list) - yield buffer_list.pop() - buffer_list.append(next(iterator)) - i += 1 - - def __len__(self): - return self.num_records - - def __iter__(self): - return self.shuffle(iter(self.random_load_from_multi_dataset())) - - -def worker_init_fn(_): - worker_info = get_worker_info() - dataset = worker_info.dataset - worker_id = worker_info.id - - local_rank = dist.get_rank() - world_size = dist.get_world_size() - num_workers = worker_info.num_workers - worker_id = worker_info.id - worker_global_id = local_rank * num_workers + worker_id - - dataset.rng = np.random.RandomState(worker_global_id) - for i in range(len(dataset.file_ids)): - - file_ids = dataset.file_ids[i] - num_chunks = world_size * num_workers - chunk_size = len(file_ids) // num_chunks - - begin_id = worker_global_id * chunk_size - 
end_id = (worker_global_id + 1) * chunk_size - dataset.file_ids[i] = dataset.file_ids[i][begin_id:end_id] - print( - f"dataset {i}, local_rank: {local_rank}, worker_id: {worker_id}, worker_global_id: {worker_global_id}, file_range: ({begin_id}, {end_id})" - ) - return np.random.seed(np.random.get_state()[1][0] + worker_id) diff --git a/ppdiffusers/examples/text_to_image_laion400m/requirements.txt b/ppdiffusers/examples/text_to_image_laion400m/requirements.txt deleted file mode 100644 index b6354c86366a..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -paddlenlp>=2.4.5 -ppdiffusers>=0.6.3 -fastcore -visualdl -Pillow \ No newline at end of file diff --git a/ppdiffusers/examples/text_to_image_laion400m/scripts/README.md b/ppdiffusers/examples/text_to_image_laion400m/scripts/README.md deleted file mode 100644 index 6f3a50350fdf..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/scripts/README.md +++ /dev/null @@ -1,60 +0,0 @@ -# LDM权重转换脚本 -本目录下包含了两个脚本文件: -- **convert_orig_ldm_ckpt_to_ppdiffusers.py**: LDM原版Pytorch权重转换为PPDiffusers版LDM权重。 -- **convert_ppdiffusers_to_orig_ldm_ckpt.py**: PPDiffusers版的LDM权重转换为原版LDM权重。 - -## 1. LDM原版Pytorch权重转换为PPDiffusers版LDM权重 -### 1.1 转换权重 -假设已经有了原版权重`"ldm_1p4b_init0.ckpt"` -```bash -python convert_orig_ldm_ckpt_to_ppdiffusers.py \ - --checkpoint_path ldm_1p4b_init0.ckpt \ - --dump_path ldm_1p4b_init0_pytorch \ - --original_config_file text2img_L32H1280_unet800M.yaml -``` - -### 1.2 推理预测 -```python -import paddle -from ppdiffusers import LDMTextToImagePipeline -model_path = "./ldm_1p4b_init0_pytorch" -pipe = LDMTextToImagePipeline.from_pretrained(model_path) -prompt = "a blue tshirt" -image = pipe(prompt, guidance_scale=7.5)[0][0] -image.save("demo.jpg") -``` - -## 2. PPDiffusers版的LDM权重转换为原版LDM权重 -### 2.1 转换权重 -假设我们已经使用 `../generate_pipelines.py`生成了`ldm_pipelines`目录。 -```shell -├── ldm_pipelines # 我们指定的输出文件路径 - ├── model_index.json # 模型index文件 - ├── vqvae # vae权重文件夹!实际是vae模型,文件夹名字与HF保持了一致! - ├── model_state.pdparams - ├── config.json - ├── bert # ldmbert权重文件夹 - ├── model_config.json - ├── model_state.pdparams - ├── unet # unet权重文件夹 - ├── model_state.pdparams - ├── config.json - ├── scheduler # ddim scheduler文件夹 - ├── scheduler_config.json - ├── tokenizer # bert tokenizer文件夹 - ├── tokenizer_config.json - ├── special_tokens_map.json - ├── vocab.txt -``` - -```bash -python convert_ppdiffusers_to_orig_ldm_ckpt.py \ - --model_name_or_path ./ldm_pipelines \ - --dump_path ldm_19w.ckpt -``` - -### 2.2 推理预测 -使用`CompVis`[原版txt2img.py](https://github.com/CompVis/latent-diffusion/blob/main/scripts/txt2img.py)脚本生成图片。 -```shell -python ./txt2img.py --prompt "a blue t shirt" --ddim_eta 0.0 --n_samples 1 --n_iter 1 --scale 7.5 --ddim_steps 50 -``` diff --git a/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_orig_ldm_ckpt_to_ppdiffusers.py b/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_orig_ldm_ckpt_to_ppdiffusers.py deleted file mode 100644 index 55a6a2cc198d..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_orig_ldm_ckpt_to_ppdiffusers.py +++ /dev/null @@ -1,775 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os - -import paddle -import torch - -try: - from omegaconf import OmegaConf -except ImportError: - raise ImportError( - "OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`." - ) -from paddlenlp.transformers import BertTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - EulerAncestralDiscreteScheduler, - LDMBertModel, - LDMTextToImagePipeline, - LMSDiscreteScheduler, - PNDMScheduler, - UNet2DConditionModel, -) - -paddle.set_device("cpu") - - -def shave_segments(path, n_shave_prefix_segments=1): - """ - Removes segments. Positive values shave the first segments, negative shave the last segments. - """ - if n_shave_prefix_segments >= 0: - return ".".join(path.split(".")[n_shave_prefix_segments:]) - else: - return ".".join(path.split(".")[:n_shave_prefix_segments]) - - -def renew_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item.replace("in_layers.0", "norm1") - new_item = new_item.replace("in_layers.2", "conv1") - - new_item = new_item.replace("out_layers.0", "norm2") - new_item = new_item.replace("out_layers.3", "conv2") - - new_item = new_item.replace("emb_layers.1", "time_emb_proj") - new_item = new_item.replace("skip_connection", "conv_shortcut") - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace("norm.weight", "group_norm.weight") - new_item = new_item.replace("norm.bias", "group_norm.bias") - - new_item = new_item.replace("q.weight", "query.weight") - new_item = new_item.replace("q.bias", "query.bias") - - new_item = new_item.replace("k.weight", "key.weight") - new_item = new_item.replace("k.bias", "key.bias") - - new_item = new_item.replace("v.weight", "value.weight") - new_item = new_item.replace("v.bias", "value.bias") - - new_item = new_item.replace("proj_out.weight", "proj_attn.weight") - new_item = new_item.replace("proj_out.bias", 
"proj_attn.bias") - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def assign_to_checkpoint( - paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None -): - """ - This does the final conversion step: take locally converted weights and apply a global renaming - to them. It splits attention layers, and takes into account additional replacements - that may arise. - Assigns the weights to the new checkpoint. - """ - assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." - - # Splits the attention layers into three variables. - if attention_paths_to_split is not None: - for path, path_map in attention_paths_to_split.items(): - old_tensor = old_checkpoint[path] - channels = old_tensor.shape[0] // 3 - - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) - - num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - - old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) - query, key, value = old_tensor.split(channels // num_heads, dim=1) - - checkpoint[path_map["query"]] = query.reshape(target_shape) - checkpoint[path_map["key"]] = key.reshape(target_shape) - checkpoint[path_map["value"]] = value.reshape(target_shape) - - for path in paths: - new_path = path["new"] - - # These have already been assigned - if attention_paths_to_split is not None and new_path in attention_paths_to_split: - continue - - # Global renaming happens here - new_path = new_path.replace("middle_block.0", "mid_block.resnets.0") - new_path = new_path.replace("middle_block.1", "mid_block.attentions.0") - new_path = new_path.replace("middle_block.2", "mid_block.resnets.1") - - if additional_replacements is not None: - for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], replacement["new"]) - - # proj_attn.weight has to be converted from conv 1D to linear - if "proj_attn.weight" in new_path: - checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0] - else: - checkpoint[new_path] = old_checkpoint[path["old"]] - - -def conv_attn_to_linear(checkpoint): - keys = list(checkpoint.keys()) - attn_keys = ["query.weight", "key.weight", "value.weight"] - for key in keys: - if ".".join(key.split(".")[-2:]) in attn_keys: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0, 0] - elif "proj_attn.weight" in key: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0] - - -def create_unet_diffusers_config(original_config): - """ - Creates a config for the diffusers based on the config of the LDM model. 
- """ - unet_params = original_config.model.params.unet_config.params - - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] - - down_block_types = [] - resolution = 1 - for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" - down_block_types.append(block_type) - if i != len(block_out_channels) - 1: - resolution *= 2 - - up_block_types = [] - for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" - up_block_types.append(block_type) - resolution //= 2 - - config = dict( - sample_size=unet_params.image_size, - in_channels=unet_params.in_channels, - out_channels=unet_params.out_channels, - down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), - block_out_channels=tuple(block_out_channels), - layers_per_block=unet_params.num_res_blocks, - cross_attention_dim=unet_params.context_dim, - attention_head_dim=unet_params.num_heads, - ) - - return config - - -def create_vae_diffusers_config(original_config): - """ - Creates a config for the diffusers based on the config of the LDM model. - """ - vae_params = original_config.model.params.first_stage_config.params.ddconfig - _ = original_config.model.params.first_stage_config.params.embed_dim - - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] - down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) - up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) - - config = dict( - sample_size=vae_params.resolution, - in_channels=vae_params.in_channels, - out_channels=vae_params.out_ch, - down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), - block_out_channels=tuple(block_out_channels), - latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, - ) - return config - - -def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): - """ - Takes a state dict and a config, and returns a converted checkpoint. - """ - - # extract state_dict for UNet - unet_state_dict = {} - keys = list(checkpoint.keys()) - - unet_key = "model.diffusion_model." - # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA - if sum(k.startswith("model_ema") for k in keys) > 100: - print(f"Checkpoint {path} has both EMA and non-EMA weights.") - if extract_ema: - print( - "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA" - " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag." - ) - for key in keys: - if key.startswith("model.diffusion_model"): - flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) - else: - print( - "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" - " weights (usually better for inference), please make sure to add the `--extract_ema` flag." 
- ) - - for key in keys: - if key.startswith(unet_key): - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) - - new_checkpoint = {} - - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] - - new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] - new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] - - new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] - new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] - new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] - new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] - - # Retrieves the keys for the input blocks only - num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) - input_blocks = { - layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] - for layer_id in range(num_input_blocks) - } - - # Retrieves the keys for the middle blocks only - num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) - middle_blocks = { - layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] - for layer_id in range(num_middle_blocks) - } - - # Retrieves the keys for the output blocks only - num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) - output_blocks = { - layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] - for layer_id in range(num_output_blocks) - } - - for i in range(1, num_input_blocks): - block_id = (i - 1) // (config["layers_per_block"] + 1) - layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) - - resnets = [ - key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key - ] - attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] - - if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.weight" - ) - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.bias" - ) - - paths = renew_resnet_paths(resnets) - meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - resnet_0 = middle_blocks[0] - attentions = middle_blocks[1] - resnet_1 = middle_blocks[2] - - resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) - - resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) - - attentions_paths = 
renew_attention_paths(attentions) - meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} - assign_to_checkpoint( - attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - for i in range(num_output_blocks): - block_id = i // (config["layers_per_block"] + 1) - layer_in_block_id = i % (config["layers_per_block"] + 1) - output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] - output_block_list = {} - - for layer in output_block_layers: - layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) - if layer_id in output_block_list: - output_block_list[layer_id].append(layer_name) - else: - output_block_list[layer_id] = [layer_name] - - if len(output_block_list) > 1: - resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] - attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] - - resnet_0_paths = renew_resnet_paths(resnets) - paths = renew_resnet_paths(resnets) - - meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - if ["conv.weight", "conv.bias"] in output_block_list.values(): - index = list(output_block_list.values()).index(["conv.weight", "conv.bias"]) - new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight" - ] - new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias" - ] - - # Clear attentions as they have been attributed above. - if len(attentions) == 2: - attentions = [] - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = { - "old": f"output_blocks.{i}.1", - "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", - } - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - else: - resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) - for path in resnet_0_paths: - old_path = ".".join(["output_blocks", str(i), path["old"]]) - new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) - - new_checkpoint[new_path] = unet_state_dict[old_path] - - return new_checkpoint - - -def convert_ldm_vae_checkpoint(checkpoint, config): - # extract state dict for VAE - vae_state_dict = {} - vae_key = "first_stage_model." 
- keys = list(checkpoint.keys()) - for key in keys: - if key.startswith(vae_key): - vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) - - new_checkpoint = {} - - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] - - new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] - new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] - - # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) - down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) - } - - # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) - up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) - } - - for i in range(num_down_blocks): - resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] - - if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight" - ) - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias" - ) - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} - assign_to_checkpoint(paths, 
new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - - for i in range(num_up_blocks): - block_id = num_up_blocks - 1 - i - resnets = [ - key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key - ] - - if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight" - ] - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias" - ] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - return new_checkpoint - - -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"): - need_transpose = [] - for k, v in vae_or_unet.named_sublayers(include_self=True): - if isinstance(v, paddle.nn.Linear): - need_transpose.append(k + ".weight") - new_vae_or_unet = {} - for k, v in diffusers_vae_unet_checkpoint.items(): - if k not in need_transpose: - new_vae_or_unet[k] = v.numpy().astype(dtype) - else: - new_vae_or_unet[k] = v.t().numpy().astype(dtype) - return new_vae_or_unet - - -def check_keys(model, state_dict): - cls_name = model.__class__.__name__ - missing_keys = [] - mismatched_keys = [] - for k, v in model.state_dict().items(): - if k not in state_dict.keys(): - missing_keys.append(k) - if list(v.shape) != list(state_dict[k].shape): - mismatched_keys.append(k) - if len(missing_keys): - missing_keys_str = ", ".join(missing_keys) - print(f"{cls_name} Found missing_keys {missing_keys_str}!") - if len(mismatched_keys): - mismatched_keys_str = ", ".join(mismatched_keys) - print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") - - -def get_default(params, key, default): - if key in params: - return params[key] - else: - return default - - -def create_ldm_bert_config(original_config): - bert_params = dict(original_config.model.params.cond_stage_config.params) - config = dict( - vocab_size=get_default(bert_params, "vocab_size", 30522), - max_position_embeddings=get_default(bert_params, "max_seq_len", 77), - encoder_layers=get_default(bert_params, "n_layer", 32), - encoder_ffn_dim=get_default(bert_params, "n_embed", 1280) * 4, - encoder_attention_heads=8, - head_dim=64, - activation_function="gelu", - d_model=get_default(bert_params, "n_embed", 1280), - dropout=0.0, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - 
pad_token_id=0, - ) - return config - - -def convert_ldm_bert_to_ppdiffusers(checkpoint, config): - # extract state dict for bert - bert_state_dict = {} - bert_key = "cond_stage_model." - keys = list(checkpoint.keys()) - for key in keys: - if key.startswith(bert_key): - bert_state_dict[key.replace(bert_key, "")] = checkpoint.get(key) - - new_checkpoint = {} - new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict["transformer.token_emb.weight"].numpy() - new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict["transformer.pos_emb.emb.weight"].numpy() - for i in range(config["encoder_layers"]): - double_i = 2 * i - double_i_plus1 = 2 * i + 1 - # convert norm - new_checkpoint[f"encoder.layers.{i}.norm1.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.0.weight" - ].numpy() - new_checkpoint[f"encoder.layers.{i}.norm1.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.0.bias" - ].numpy() - - new_checkpoint[f"encoder.layers.{i}.self_attn.q_proj.weight"] = ( - bert_state_dict[f"transformer.attn_layers.layers.{double_i}.1.to_q.weight"].t().numpy() - ) - new_checkpoint[f"encoder.layers.{i}.self_attn.k_proj.weight"] = ( - bert_state_dict[f"transformer.attn_layers.layers.{double_i}.1.to_k.weight"].t().numpy() - ) - new_checkpoint[f"encoder.layers.{i}.self_attn.v_proj.weight"] = ( - bert_state_dict[f"transformer.attn_layers.layers.{double_i}.1.to_v.weight"].t().numpy() - ) - new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.weight"] = ( - bert_state_dict[f"transformer.attn_layers.layers.{double_i}.1.to_out.weight"].t().numpy() - ) - new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_out.bias" - ].numpy() - - new_checkpoint[f"encoder.layers.{i}.norm2.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.0.weight" - ].numpy() - new_checkpoint[f"encoder.layers.{i}.norm2.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.0.bias" - ].numpy() - new_checkpoint[f"encoder.layers.{i}.linear1.weight"] = ( - bert_state_dict[f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight"].t().numpy() - ) - new_checkpoint[f"encoder.layers.{i}.linear1.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias" - ].numpy() - new_checkpoint[f"encoder.layers.{i}.linear2.weight"] = ( - bert_state_dict[f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight"].t().numpy() - ) - new_checkpoint[f"encoder.layers.{i}.linear2.bias"] = ( - bert_state_dict[f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias"].t().numpy() - ) - - new_checkpoint["final_layer_norm.weight"] = bert_state_dict["transformer.norm.weight"].numpy() - new_checkpoint["final_layer_norm.bias"] = bert_state_dict["transformer.norm.bias"].numpy() - - return new_checkpoint - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." 
- ) - # wget https://raw.githubusercontent.com/CompVis/latent-diffusion/main/configs/latent-diffusion/txt2img-1p4B-eval.yaml - parser.add_argument( - "--original_config_file", - default="text2img_L32H1280_unet800M.yaml", - type=str, - help="The YAML config file corresponding to the original architecture.", - ) - parser.add_argument( - "--scheduler_type", - default="ddim", - type=str, - choices=["ddim", "lms", "pndm", "euler-ancest"], - help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler-ancest']", - ) - parser.add_argument( - "--extract_ema", - action="store_true", - help=( - "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" - " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield" - " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning." - ), - ) - parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") - - args = parser.parse_args() - - if args.original_config_file is None: - os.system( - "wget https://raw.githubusercontent.com/CompVis/latent-diffusion/main/configs/latent-diffusion/txt2img-1p4B-eval.yaml" - ) - args.original_config_file = "./txt2img-1p4B-eval.yaml" - - original_config = OmegaConf.load(args.original_config_file) - - checkpoint = torch.load(args.checkpoint_path, map_location="cpu") - checkpoint = checkpoint.get("state_dict", checkpoint) - - # 1. Convert the UNet2DConditionModel model. - diffusers_unet_config = create_unet_diffusers_config(original_config) - diffusers_unet_checkpoint = convert_ldm_unet_checkpoint( - checkpoint, diffusers_unet_config, path=args.checkpoint_path, extract_ema=args.extract_ema - ) - unet = UNet2DConditionModel(**diffusers_unet_config) - ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(unet, diffusers_unet_checkpoint) - check_keys(unet, ppdiffusers_unet_checkpoint) - unet.load_dict(ppdiffusers_unet_checkpoint) - - # 2. Convert the VAE model. - vae_config = create_vae_diffusers_config(original_config) - diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) - vae = AutoencoderKL(**vae_config) - ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint) - check_keys(vae, ppdiffusers_vae_checkpoint) - vae.load_dict(ppdiffusers_vae_checkpoint) - - # 3. Convert the text model. - text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] - - if text_model_type != "BERTEmbedder": - print("We only support BERTEmbedder as text_encoder!") - - # 4. Convert the Bert model. - bert_config = create_ldm_bert_config(original_config) - ppdiffusers_bert_checkpoint = convert_ldm_bert_to_ppdiffusers(checkpoint, bert_config) - bert = LDMBertModel(**bert_config) - check_keys(bert, ppdiffusers_bert_checkpoint) - bert.load_dict(ppdiffusers_bert_checkpoint) - - # 5. Convert tokenizer. - tokenizer = BertTokenizer.from_pretrained( - "bert-base-uncased", model_max_length=bert_config["max_position_embeddings"] - ) - if tokenizer.vocab_size != bert_config["vocab_size"]: - print("Vocab size mismatched! Please verify your tokenizer or text encoder!") - - # 6. Convert scheduler. 
- num_train_timesteps = original_config.model.params.timesteps - beta_start = original_config.model.params.linear_start - beta_end = original_config.model.params.linear_end - if args.scheduler_type == "pndm": - scheduler = PNDMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - # Make sure the scheduler compatible with DDIM - set_alpha_to_one=False, - steps_offset=1, - # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, - ) - elif args.scheduler_type == "lms": - scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") - elif args.scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler( - beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear" - ) - elif args.scheduler_type == "ddim": - scheduler = DDIMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - # Make sure the scheduler compatible with DDIM - clip_sample=False, - set_alpha_to_one=False, - steps_offset=1, - ) - else: - raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") - - pipe = LDMTextToImagePipeline(vqvae=vae, bert=bert, tokenizer=tokenizer, unet=unet, scheduler=scheduler) - - pipe.save_pretrained(args.dump_path) diff --git a/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_ppdiffusers_to_orig_ldm_ckpt.py b/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_ppdiffusers_to_orig_ldm_ckpt.py deleted file mode 100644 index efa50695b266..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_ppdiffusers_to_orig_ldm_ckpt.py +++ /dev/null @@ -1,331 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Script for converting a PPDiffusers saved pipeline to a Latent Diffusion Model checkpoint. -# *Only* converts the UNet, VAE, and LDMBert(Text Encoder). -# Does not convert optimizer state or any other thing. 
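Both conversion scripts in this directory reduce to the same two operations: renaming state-dict keys with a fixed mapping, and transposing the weights of every nn.Linear layer, because Paddle stores Linear weights as (in_features, out_features) while PyTorch stores them as (out_features, in_features). A minimal, framework-free sketch of that pattern is shown below; it is illustrative only, the key names are hypothetical, and it is not part of the removed scripts.

```python
# Illustrative sketch of the rename-and-transpose pattern used by the converters.
# Assumption: state dicts are plain dicts of numpy-compatible arrays; key names are hypothetical.
import numpy as np

def convert_state_dict(src_state, rename_map, linear_weight_keys):
    """Rename keys and transpose Linear weights for a PyTorch -> Paddle move."""
    converted = {}
    for old_key, tensor in src_state.items():
        new_key = rename_map.get(old_key, old_key)
        array = np.asarray(tensor, dtype=np.float32)
        if new_key in linear_weight_keys:
            # (out_features, in_features) -> (in_features, out_features)
            array = array.T
        converted[new_key] = array
    return converted

# Toy usage with a single hypothetical projection layer.
src = {"attn.to_q.weight": np.ones((4, 8)), "attn.to_q.bias": np.zeros(4)}
dst = convert_state_dict(
    src,
    {"attn.to_q.weight": "self_attn.q_proj.weight", "attn.to_q.bias": "self_attn.q_proj.bias"},
    {"self_attn.q_proj.weight"},
)
assert dst["self_attn.q_proj.weight"].shape == (8, 4)
```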
- -import argparse - -import paddle -import torch - -from ppdiffusers import LDMTextToImagePipeline - -# =================# -# UNet Conversion # -# =================# -paddle.set_device("cpu") - -unet_conversion_map = [ - # (stable-diffusion, HF Diffusers) - ("time_embed.0.weight", "time_embedding.linear_1.weight"), - ("time_embed.0.bias", "time_embedding.linear_1.bias"), - ("time_embed.2.weight", "time_embedding.linear_2.weight"), - ("time_embed.2.bias", "time_embedding.linear_2.bias"), - ("input_blocks.0.0.weight", "conv_in.weight"), - ("input_blocks.0.0.bias", "conv_in.bias"), - ("out.0.weight", "conv_norm_out.weight"), - ("out.0.bias", "conv_norm_out.bias"), - ("out.2.weight", "conv_out.weight"), - ("out.2.bias", "conv_out.bias"), -] - -unet_conversion_map_resnet = [ - # (stable-diffusion, HF Diffusers) - ("in_layers.0", "norm1"), - ("in_layers.2", "conv1"), - ("out_layers.0", "norm2"), - ("out_layers.3", "conv2"), - ("emb_layers.1", "time_emb_proj"), - ("skip_connection", "conv_shortcut"), -] - -unet_conversion_map_layer = [] -# hardcoded number of downblocks and resnets/attentions... -# would need smarter logic for other networks. -for i in range(4): - # loop over downblocks/upblocks - - for j in range(2): - # loop over resnets/attentions for downblocks - hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}." - sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0." - unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix)) - - if i < 3: - # no attention layers in down_blocks.3 - hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}." - sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1." - unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix)) - - for j in range(3): - # loop over resnets/attentions for upblocks - hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}." - sd_up_res_prefix = f"output_blocks.{3*i + j}.0." - unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix)) - - if i > 0: - # no attention layers in up_blocks.0 - hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}." - sd_up_atn_prefix = f"output_blocks.{3*i + j}.1." - unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix)) - - if i < 3: - # no downsample in down_blocks.3 - hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv." - sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op." - unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix)) - - # no upsample in up_blocks.3 - hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0." - sd_upsample_prefix = f"output_blocks.{3*i + 2}.{1 if i == 0 else 2}." - unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix)) - -hf_mid_atn_prefix = "mid_block.attentions.0." -sd_mid_atn_prefix = "middle_block.1." -unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix)) - -for j in range(2): - hf_mid_res_prefix = f"mid_block.resnets.{j}." - sd_mid_res_prefix = f"middle_block.{2*j}." - unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix)) - - -def convert_unet_state_dict(unet_state_dict): - # buyer beware: this is a *brittle* function, - # and correct output requires that all of these pieces interact in - # the exact order in which I have arranged them. 
- mapping = {k: k for k in unet_state_dict.keys()} - for sd_name, hf_name in unet_conversion_map: - mapping[hf_name] = sd_name - for k, v in mapping.items(): - if "resnets" in k: - for sd_part, hf_part in unet_conversion_map_resnet: - v = v.replace(hf_part, sd_part) - mapping[k] = v - for k, v in mapping.items(): - for sd_part, hf_part in unet_conversion_map_layer: - v = v.replace(hf_part, sd_part) - mapping[k] = v - new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items()} - return new_state_dict - - -# ================# -# VAE Conversion # -# ================# - -vae_conversion_map = [ - # (stable-diffusion, HF Diffusers) - ("nin_shortcut", "conv_shortcut"), - ("norm_out", "conv_norm_out"), - ("mid.attn_1.", "mid_block.attentions.0."), -] - -for i in range(4): - # down_blocks have two resnets - for j in range(2): - hf_down_prefix = f"encoder.down_blocks.{i}.resnets.{j}." - sd_down_prefix = f"encoder.down.{i}.block.{j}." - vae_conversion_map.append((sd_down_prefix, hf_down_prefix)) - - if i < 3: - hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0." - sd_downsample_prefix = f"down.{i}.downsample." - vae_conversion_map.append((sd_downsample_prefix, hf_downsample_prefix)) - - hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0." - sd_upsample_prefix = f"up.{3-i}.upsample." - vae_conversion_map.append((sd_upsample_prefix, hf_upsample_prefix)) - - # up_blocks have three resnets - # also, up blocks in hf are numbered in reverse from sd - for j in range(3): - hf_up_prefix = f"decoder.up_blocks.{i}.resnets.{j}." - sd_up_prefix = f"decoder.up.{3-i}.block.{j}." - vae_conversion_map.append((sd_up_prefix, hf_up_prefix)) - -# this part accounts for mid blocks in both the encoder and the decoder -for i in range(2): - hf_mid_res_prefix = f"mid_block.resnets.{i}." - sd_mid_res_prefix = f"mid.block_{i+1}." 
- vae_conversion_map.append((sd_mid_res_prefix, hf_mid_res_prefix)) - -vae_conversion_map_attn = [ - # (stable-diffusion, HF Diffusers) - ("norm.", "group_norm."), - ("q.", "query."), - ("k.", "key."), - ("v.", "value."), - ("proj_out.", "proj_attn."), -] - - -def reshape_weight_for_sd(w): - # convert HF linear weights to SD conv2d weights - return w.reshape(*w.shape, 1, 1) - - -def convert_vae_state_dict(vae_state_dict): - mapping = {k: k for k in vae_state_dict.keys()} - for k, v in mapping.items(): - for sd_part, hf_part in vae_conversion_map: - v = v.replace(hf_part, sd_part) - mapping[k] = v - for k, v in mapping.items(): - if "attentions" in k: - for sd_part, hf_part in vae_conversion_map_attn: - v = v.replace(hf_part, sd_part) - mapping[k] = v - new_state_dict = {v: vae_state_dict[k] for k, v in mapping.items()} - weights_to_convert = ["q", "k", "v", "proj_out"] - for k, v in new_state_dict.items(): - for weight_name in weights_to_convert: - if f"mid.attn_1.{weight_name}.weight" in k: - print(f"Reshaping {k} for SD format") - new_state_dict[k] = reshape_weight_for_sd(v) - return new_state_dict - - -# =========================# -# Text Encoder Conversion # -# =========================# -# pretty much a no-op - - -def convert_ppdiffusers_vae_unet_to_diffusers(vae_or_unet, ppdiffusers_vae_unet_checkpoint): - need_transpose = [] - for k, v in vae_or_unet.named_sublayers(include_self=True): - if isinstance(v, paddle.nn.Linear): - need_transpose.append(k + ".weight") - new_vae_or_unet = {} - for k, v in ppdiffusers_vae_unet_checkpoint.items(): - if k not in need_transpose: - new_vae_or_unet[k] = torch.from_numpy(v.numpy()) - else: - new_vae_or_unet[k] = torch.from_numpy(v.t().numpy()) - return new_vae_or_unet - - -def convert_ldmbert_state_dict(ldmbert_state_dict, num_layers=32): - ppdiffusers_mapping_to_orig = {} - ppdiffusers_mapping_to_orig["embeddings.word_embeddings.weight"] = "cond_stage_model.transformer.token_emb.weight" - ppdiffusers_mapping_to_orig[ - "embeddings.position_embeddings.weight" - ] = "cond_stage_model.transformer.pos_emb.emb.weight" - for i in range(num_layers): - double_i = 2 * i - double_i_plus1 = 2 * i + 1 - ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.norm1.weight" - ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.0.weight" - ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.norm1.bias" - ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.0.bias" - - ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.self_attn.q_proj.weight"] = ( - f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_q.weight", - "transpose", - ) - ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.self_attn.k_proj.weight"] = ( - f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_k.weight", - "transpose", - ) - ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.self_attn.v_proj.weight"] = ( - f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_v.weight", - "transpose", - ) - ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.self_attn.out_proj.weight"] = ( - f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_out.weight", - "transpose", - ) - ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.self_attn.out_proj.bias" - ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_out.bias" - - ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.norm2.weight" - ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.0.weight" - ppdiffusers_mapping_to_orig[ - 
f"encoder.layers.{i}.norm2.bias" - ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.0.bias" - ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.linear1.weight"] = ( - f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight", - "transpose", - ) - ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.linear1.bias" - ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias" - ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.linear2.weight"] = ( - f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight", - "transpose", - ) - ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.linear2.bias" - ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias" - - ppdiffusers_mapping_to_orig["final_layer_norm.weight"] = "cond_stage_model.transformer.norm.weight" - ppdiffusers_mapping_to_orig["final_layer_norm.bias"] = "cond_stage_model.transformer.norm.bias" - - new_state_dict = {} - for k, v in ldmbert_state_dict.items(): - new_name = ppdiffusers_mapping_to_orig[k] - need_transpose = False - if isinstance(new_name, (list, tuple)): - need_transpose = True - new_name = new_name[0] - new_state_dict[new_name] = torch.from_numpy(v.t().numpy()) if need_transpose else torch.from_numpy(v.numpy()) - - # dummpy weights, we donot use this! - new_state_dict["cond_stage_model.transformer.to_logits.weight"] = torch.zeros( - new_state_dict["cond_stage_model.transformer.token_emb.weight"].shape - ) - new_state_dict["cond_stage_model.transformer.to_logits.bias"] = torch.zeros( - new_state_dict["cond_stage_model.transformer.token_emb.weight"].shape[0] - ) - return new_state_dict - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", default=None, type=str, required=True, help="Path to the model to convert." - ) - parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") - parser.add_argument("--half", action="store_true", help="Save weights in half precision.") - - args = parser.parse_args() - pipe = LDMTextToImagePipeline.from_pretrained(args.model_name_or_path) - - # Convert the UNet model - unet_state_dict = convert_ppdiffusers_vae_unet_to_diffusers(pipe.unet, pipe.unet.state_dict()) - unet_state_dict = convert_unet_state_dict(unet_state_dict) - unet_state_dict = {"model.diffusion_model." + k: v for k, v in unet_state_dict.items()} - - # Convert the VAE model - vae_state_dict = convert_ppdiffusers_vae_unet_to_diffusers(pipe.vqvae, pipe.vqvae.state_dict()) - vae_state_dict = convert_vae_state_dict(vae_state_dict) - vae_state_dict = {"first_stage_model." + k: v for k, v in vae_state_dict.items()} - - # Convert the ldmbert model - text_enc_dict = convert_ldmbert_state_dict(pipe.bert.state_dict(), num_layers=pipe.bert.config["encoder_layers"]) - - # Put together new checkpoint - state_dict = {**unet_state_dict, **vae_state_dict, **text_enc_dict} - if args.half: - state_dict = {k: v.half() for k, v in state_dict.items()} - state_dict = {"state_dict": state_dict} - torch.save(state_dict, args.dump_path) diff --git a/ppdiffusers/examples/text_to_image_laion400m/scripts/plot_fid_clip_score.py b/ppdiffusers/examples/text_to_image_laion400m/scripts/plot_fid_clip_score.py deleted file mode 100644 index a9d024779401..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/scripts/plot_fid_clip_score.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import matplotlib.pyplot as plt - -clip_pd = [26.421875, 27.000000, 27.359375, 27.593750, 27.734375, 27.843750] -fid_pd = [ - 61.63832793539143, - 60.8674158133241, - 61.06873714387473, - 61.611543101882205, - 62.41915003785908, - 63.73414302529454, -] - -clip_pt = [22.312500, 22.890625, 23.265625, 23.484375, 23.703125, 23.781250] -fid_pt = [ - 84.44459421090801, - 80.3668421393279, - 78.9310124831315, - 77.22472126942046, - 76.33773728759894, - 75.6021109021998, -] - -plt.plot(clip_pd, fid_pd, label="Paddle line", linewidth=3, color="r", marker="o", markerfacecolor="blue") -plt.plot(clip_pt, fid_pt, label="Pytorch line", linewidth=3, color="b", marker="o", markerfacecolor="red") -plt.xlabel("CLIP Score") -plt.ylabel("FID@1k") -plt.title("12W Globel Step Pareto Curves - DDIM") -plt.legend() -plt.savefig("ddim-12w.png") -plt.show() diff --git a/ppdiffusers/examples/text_to_image_laion400m/scripts/text2img_L12H768_unet800M.yaml b/ppdiffusers/examples/text_to_image_laion400m/scripts/text2img_L12H768_unet800M.yaml deleted file mode 100644 index ab5ac44d2b4f..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/scripts/text2img_L12H768_unet800M.yaml +++ /dev/null @@ -1,104 +0,0 @@ -model: - base_learning_rate: 1.0e-4 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.00085 - linear_end: 0.012 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - cond_stage_key: caption - image_size: 32 - channels: 4 - cond_stage_trainable: true - conditioning_key: crossattn - monitor: val/loss_simple_ema - scale_factor: 0.18215 - use_ema: False - - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 32 - in_channels: 4 - out_channels: 4 - model_channels: 320 - attention_resolutions: - - 4 - - 2 - - 1 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 4 - - 4 - num_heads: 8 - use_spatial_transformer: true - transformer_depth: 1 - context_dim: 768 - use_checkpoint: true - legacy: False - - first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - ckpt_path: pretrained_autoencoder/kl-f8.ckpt - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - -# cond_stage_config: -# target: ldm.modules.encoders.clip.CLIPEmbedder -# params: -# clip_name: ViT-B/16 - cond_stage_config: - target: ldm.modules.encoders.modules.BERTEmbedder - params: - n_embed: 768 - n_layer: 12 - - -data: - target: main.DataModuleFromConfig - params: - batch_size: 8 - num_workers: 4 - train: - target: ldm.data.text_image_pair.TextImagePair - params: - file_list: data/filelist/train.filelist.list - size: 256 - num_records: 10000000 - buffer_size: 100 - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - 
params: - batch_frequency: 1000 - max_images: 8 - increase_log_steps: False - save_every_steps: 5000 - - trainer: - benchmark: True - accumulate_grad_batches: 16 diff --git a/ppdiffusers/examples/text_to_image_laion400m/scripts/text2img_L32H1280_unet800M.yaml b/ppdiffusers/examples/text_to_image_laion400m/scripts/text2img_L32H1280_unet800M.yaml deleted file mode 100644 index 7a052ffa43a5..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/scripts/text2img_L32H1280_unet800M.yaml +++ /dev/null @@ -1,105 +0,0 @@ -model: - base_learning_rate: 5.0e-5 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.00085 - linear_end: 0.012 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - cond_stage_key: caption - image_size: 32 - channels: 4 - cond_stage_trainable: true - conditioning_key: crossattn - monitor: val/loss_simple_ema - scale_factor: 0.18215 - use_ema: False - # ckpt_path: ./pretrained_autoencoder/pretrained_text2img_unet.ckpt - - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 32 - in_channels: 4 - out_channels: 4 - model_channels: 320 - attention_resolutions: - - 4 - - 2 - - 1 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 4 - - 4 - num_heads: 8 - use_spatial_transformer: true - transformer_depth: 1 - context_dim: 1280 - use_checkpoint: true - legacy: False - - first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - ckpt_path: pretrained_autoencoder/kl-f8.ckpt - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - -# cond_stage_config: -# target: ldm.modules.encoders.clip.CLIPEmbedder -# params: -# clip_name: ViT-B/16 - cond_stage_config: - target: ldm.modules.encoders.modules.BERTEmbedder - params: - n_embed: 1280 - n_layer: 32 - - -data: - target: main.DataModuleFromConfig - params: - batch_size: 64 - num_workers: 12 - train: - target: ldm.data.text_image_pair.TextImagePair - params: - file_list: data/filelist/train.filelist.list - size: 256 - num_records: 10000000 - buffer_size: 100 - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - params: - batch_frequency: 1000 - max_images: 8 - increase_log_steps: False - save_every_steps: 5000 - - trainer: - benchmark: True - accumulate_grad_batches: 1 diff --git a/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_no_trainer.py b/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_no_trainer.py deleted file mode 100644 index ff0c482ad17a..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_no_trainer.py +++ /dev/null @@ -1,218 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import contextlib -import itertools -import math -import os -import sys -import time - -import paddle -import paddle.nn as nn -from ldm import ( - DataArguments, - LatentDiffusionModel, - ModelArguments, - NoTrainerTrainingArguments, - TextImagePair, - worker_init_fn, -) -from paddle.io import DataLoader -from paddle.optimizer import AdamW - -from paddlenlp.trainer import PdArgumentParser, set_seed -from paddlenlp.utils.log import logger -from ppdiffusers.optimization import get_scheduler -from ppdiffusers.training_utils import unwrap_model - - -def get_writer(training_args): - if training_args.report_to == "visualdl": - from visualdl import LogWriter - - writer = LogWriter(logdir=training_args.logging_dir) - elif training_args.report_to == "tensorboard": - from tensorboardX import SummaryWriter - - writer = SummaryWriter(logdir=training_args.logging_dir) - else: - raise ValueError("writer_type must be in ['visualdl', 'tensorboard']") - return writer - - -def main(): - parser = PdArgumentParser((ModelArguments, DataArguments, NoTrainerTrainingArguments)) - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - training_args.image_logging_steps = model_args.image_logging_steps = ( - math.ceil(model_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps - ) - training_args.resolution = data_args.resolution - training_args.print_config(training_args, "Training") - training_args.print_config(model_args, "Model") - training_args.print_config(data_args, "Data") - - rank = paddle.distributed.get_rank() - num_processes = paddle.distributed.get_world_size() - - if num_processes > 1: - paddle.distributed.init_parallel_env() - - training_args.logging_dir = os.path.join(training_args.output_dir, training_args.logging_dir) - - if training_args.seed is not None: - set_seed(training_args.seed) - - if training_args.output_dir is not None: - os.makedirs(training_args.output_dir, exist_ok=True) - - model = LatentDiffusionModel(model_args) - model.set_recompute(training_args.recompute) - params_to_train = itertools.chain(model.text_encoder.parameters(), model.unet.parameters()) - - lr_scheduler = get_scheduler( - training_args.lr_scheduler_type, - learning_rate=training_args.learning_rate, - num_warmup_steps=training_args.warmup_steps * training_args.gradient_accumulation_steps, - num_training_steps=training_args.max_steps * training_args.gradient_accumulation_steps, - ) - - optimizer = AdamW( - learning_rate=lr_scheduler, - parameters=params_to_train, - beta1=training_args.adam_beta1, - beta2=training_args.adam_beta2, - weight_decay=training_args.weight_decay, - epsilon=training_args.adam_epsilon, - grad_clip=nn.ClipGradByGlobalNorm(training_args.max_grad_norm) - if training_args.max_grad_norm is not None and training_args.max_grad_norm > 0 - else None, - ) - train_dataset = TextImagePair( - file_list=data_args.file_list, - size=data_args.resolution, - num_records=data_args.num_records, - buffer_size=data_args.buffer_size, - shuffle_every_n_samples=data_args.shuffle_every_n_samples, - interpolation="lanczos", - tokenizer=model.tokenizer, - ) - - if num_processes > 1: - model = paddle.DataParallel(model) - - train_dataloader = DataLoader( - train_dataset, - batch_size=training_args.per_device_train_batch_size, - num_workers=training_args.dataloader_num_workers, - worker_init_fn=worker_init_fn, - ) - - if rank == 0: - writer = get_writer(training_args) - - # Train! 
- total_batch_size = ( - training_args.per_device_train_batch_size * num_processes * training_args.gradient_accumulation_steps - ) - - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num Epochs = {training_args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") - logger.info(f" Gradient Accumulation steps = {training_args.gradient_accumulation_steps}") - - global_steps = 0 - tic_train = time.time() - - for epoch in range(training_args.num_train_epochs): - if epoch == training_args.num_train_epochs: - logger.info("***** Training Done *****") - break - - for step, batch in enumerate(train_dataloader): - if ( - num_processes > 1 and ((step + 1) % training_args.gradient_accumulation_steps != 0) - ) or training_args.recompute: - # grad acc, no_sync when (step + 1) % training_args.gradient_accumulation_steps != 0: - ctx_manager = model.no_sync() - else: - ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() - - with ctx_manager: - loss = model(**batch) - if training_args.gradient_accumulation_steps > 1: - loss = loss / training_args.gradient_accumulation_steps - loss.backward() - - if (step + 1) % training_args.gradient_accumulation_steps == 0: - optimizer.step() - lr_scheduler.step() - optimizer.clear_grad() - global_steps += 1 - unwrap_model(model).on_train_batch_end() - - # train log - if global_steps % training_args.logging_steps == 0: - logs = { - "train/loss": loss.item() * training_args.gradient_accumulation_steps, - "train/lr_abs": lr_scheduler.get_lr(), - "train/global_steps": global_steps, - } - if rank == 0: - # add scalar - for name, val in logs.items(): - writer.add_scalar(name, val, global_steps) - log_str = "Train: global_steps {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, train_loss: {4:.10f}, lr_abs: {5:.10f}, speed: {6:.2f} s/it.".format( - global_steps, - training_args.max_steps, - epoch, - step + 1, - logs["train/loss"], - logs["train/lr_abs"], - (time.time() - tic_train) / training_args.logging_steps, - ) - logger.info(log_str) - - if global_steps % training_args.image_logging_steps == 0: - reconstruction_img = unwrap_model(model).decode_image(pixel_values=batch["pixel_values"]) - ddim_10_img = unwrap_model(model).log_image(input_ids=batch["input_ids"], guidance_scale=1.0) - ddim_75_img = unwrap_model(model).log_image(input_ids=batch["input_ids"], guidance_scale=7.5) - if rank == 0: - writer.add_image("reconstruction", reconstruction_img, global_steps, dataformats="NHWC") - writer.add_image("ddim-samples-1.0", ddim_10_img, global_steps, dataformats="NHWC") - writer.add_image("ddim-samples-7.5", ddim_75_img, global_steps, dataformats="NHWC") - tic_train = time.time() - - if rank == 0 and global_steps % training_args.save_steps == 0: - os.makedirs( - os.path.join(training_args.output_dir, f"global-steps-{global_steps}"), exist_ok=True - ) - paddle.save( - model.state_dict(), - os.path.join( - training_args.output_dir, f"global-steps-{global_steps}", "model_state.pdparams" - ), - ) - - if global_steps >= training_args.max_steps: - break - if rank == 0: - paddle.save(model.state_dict(), os.path.join(training_args.output_dir, "model_state.pdparams")) - writer.close() - - -if __name__ == "__main__": - main() diff --git a/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_trainer.py 
b/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_trainer.py deleted file mode 100644 index ede05519f3b7..000000000000 --- a/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_trainer.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import itertools -import math -import os - -import paddle -from ldm import ( - DataArguments, - LatentDiffusionModel, - LatentDiffusionTrainer, - ModelArguments, - TextImagePair, -) - -from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint -from paddlenlp.utils.log import logger - - -def main(): - parser = PdArgumentParser((ModelArguments, DataArguments, TrainingArguments)) - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # report to custom_visualdl - training_args.report_to = ["custom_visualdl"] - training_args.resolution = data_args.resolution - training_args.benchmark = model_args.benchmark - training_args.profiler_options = model_args.profiler_options - training_args.image_logging_steps = model_args.image_logging_steps = ( - (math.ceil(model_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps) - if model_args.image_logging_steps > 0 - else -1 - ) - - training_args.print_config(model_args, "Model") - training_args.print_config(data_args, "Data") - - paddle.set_device(training_args.device) - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
- ) - - model = LatentDiffusionModel(model_args) - train_dataset = TextImagePair( - file_list=data_args.file_list, - size=data_args.resolution, - num_records=data_args.num_records, - buffer_size=data_args.buffer_size, - shuffle_every_n_samples=data_args.shuffle_every_n_samples, - interpolation="lanczos", - tokenizer=model.tokenizer, - ) - - if model_args.to_static: - input_ids = paddle.static.InputSpec(name="input_ids", shape=[-1, model_args.model_max_length], dtype="int64") - pixel_values = paddle.static.InputSpec( - name="pixel_values", shape=[-1, 3, data_args.resolution, data_args.resolution], dtype="float32" - ) - specs = [input_ids, pixel_values] - paddle.jit.ignore_module([os]) - model = paddle.jit.to_static(model, input_spec=specs) - logger.info("Successfully to apply @to_static with specs: {}".format(specs)) - - trainer = LatentDiffusionTrainer( - model=model, args=training_args, train_dataset=train_dataset, tokenizer=model.tokenizer - ) - # must set recompute after trainer init - trainer.model.set_recompute(training_args.recompute) - params_to_train = itertools.chain(trainer.model.text_encoder.parameters(), trainer.model.unet.parameters()) - trainer.set_optimizer_grouped_parameters(params_to_train) - - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - - # Training - trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() - trainer.save_state() - - -if __name__ == "__main__": - main() diff --git a/ppdiffusers/examples/textual_inversion/README.md b/ppdiffusers/examples/textual_inversion/README.md deleted file mode 100644 index 1404eb00bc1f..000000000000 --- a/ppdiffusers/examples/textual_inversion/README.md +++ /dev/null @@ -1,285 +0,0 @@ -## Textual Inversion 微调代码 - -[Textual inversion](https://arxiv.org/abs/2208.01618) 是一种个性化定制的文本生成图像(text2image)技术。我们只需要给模型提供 3-5 张图片,就可以训练个性化的Stable Diffusion模型。 -


- - -## 1 Running locally -### 1.1 Install dependencies - -Before running this training code, we need to install the following training dependencies. - -```bash -pip install -U ppdiffusers visualdl -``` - -### 1.2 Cat toy: an example of training an object - -Before training starts, we need to prepare the 3-5 images to train on. For this example, the images can be downloaded from [here](https://huggingface.co/sd-dreambooth-library/cat-toy/tree/main/concept_images) and saved into a folder named `cat_toy_images`. -
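If it helps, the images can also be fetched programmatically. The snippet below is an editorial sketch rather than part of the original example; it assumes `huggingface_hub` is installed (`pip install huggingface_hub`) and that the repo keeps its images under `concept_images/`:

```python
import shutil
from pathlib import Path

from huggingface_hub import snapshot_download

# Download only the concept_images/ folder of the sd-dreambooth-library/cat-toy repo.
repo_dir = snapshot_download(repo_id="sd-dreambooth-library/cat-toy", allow_patterns=["concept_images/*"])

# Copy the images into the folder name used by the training command below.
out_dir = Path("cat_toy_images")
out_dir.mkdir(exist_ok=True)
for img in (Path(repo_dir) / "concept_images").iterdir():
    shutil.copy(img, out_dir / img.name)
print(f"Saved {len(list(out_dir.iterdir()))} images to {out_dir}")
```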


- - -#### 1.2.1 Hardware requirements -A GPU with 16GB of memory is required for the fine-tuning below to succeed. - -#### 1.2.2 Single-node single-GPU training -```bash -export MODEL_NAME="CompVis/stable-diffusion-v1-4" -# Set this to the directory where the images you just downloaded were saved -export DATA_DIR="cat_toy_images" - -python -u train_textual_inversion.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --train_data_dir=$DATA_DIR \ - --learnable_property="object" \ - --placeholder_token="" --initializer_token="toy" \ - --resolution=512 \ - --train_batch_size=1 \ - --gradient_accumulation_steps=4 \ - --max_train_steps=3000 \ - --learning_rate=5.0e-04 \ - --scale_lr \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --seed 42 \ - --output_dir="textual_inversion_cat" -``` - -The arguments that `train_textual_inversion.py` accepts are explained below: -> Arguments you will usually change -> * `--pretrained_model_name_or_path`: Name of the Stable Diffusion weights to use, or the path to a locally downloaded model; the 8 sets of pretrained weights in the table above are supported and can be swapped in directly. -> * `--train_data_dir`: Directory containing the training data; the example above uses the `cat_toy_images` directory. -> * `--placeholder_token`: Placeholder token representing the `object` to be trained; the example above uses `` as the placeholder, and we recommend designing placeholders in this form, with `< >` and `-`. -> * `--initializer_token`: A token used to initialize the placeholder token. By giving the `placeholder_token` an `initializer_token`, the model can quickly grasp what the new `placeholder_token` means, which speeds up learning. Note: the example above initializes the `` token with the word `toy`, so during training the model already has the prior knowledge that `` is a kind of `toy`. -> * `--learnable_property`: What to learn; choose from `["object", "style"]`. `object` means learning an object: the model learns the new object from the 3-5 images we provide. `style` means learning a style: the model can likewise learn a particular painting style. -> * `--num_train_epochs`: Number of training epochs; the default is `100`. -> * `--max_train_steps`: Maximum number of training steps; when set, the required `num_train_epochs` is recomputed from it. -> * `--save_steps`: How often (in global steps) to save the learned file `learned_embeds.pdparams`. -> * `--gradient_accumulation_steps`: Number of gradient accumulation steps. Accumulating gradients reduces gradient communication between cards and the number of parameter updates, effectively enlarging the training batch size. -> * `--enable_xformers_memory_efficient_attention`: Whether to enable `xformers`; enabling it slows training somewhat but saves GPU memory. Note that the develop version of paddlepaddle must be installed! 
- -> Arguments you may want to change -> * `--language`: Language of the model, `zh`, `en`, or `zh_en`; set this to `zh` when using a Chinese model. -> * `--learning_rate`: Learning rate. -> * `--scale_lr`: Whether to scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size. Scaling formula: `learning_rate * gradient_accumulation_steps * train_batch_size * num_processes`. -> * `--lr_scheduler`: The learning rate schedule to use. Defaults to `constant`. -> * `--lr_warmup_steps`: Number of steps for the linear warmup from 0 to `learning_rate`. -> * `--train_batch_size`: Batch size used per GPU during training; set this to a smaller value when GPU memory is limited. -> * `--center_crop`: Whether to center-crop the images before resizing their width and height; defaults to `False`. -> * `--height`: `Height` of the images fed to the model. Since user-provided images are not of a fixed size, the code resizes the original images to this `height`; defaults to `None`. -> * `--width`: `Width` of the images fed to the model. Since user-provided images are not of a fixed size, the code resizes the original images to this `width`; defaults to `None`. -> * `--resolution`: `Resolution` of the images fed to the model; when `height` or `width` is `None`, `resolution` is used instead; defaults to `512`. -> * `--repeats`: Because there are only 3-5 images, the training images must be repeated; the default repeats them `100` times. -> * `--gradient_checkpointing`: Whether to enable `gradient_checkpointing`, which saves some GPU memory at the cost of slower training. -> * `--output_dir`: Directory where the trained model is saved; defaults to the `text-inversion-model` folder. It is recommended to change the output path for every new training run so that previously trained models are not overwritten. -> * `--validation_prompt`: Prompt text used for evaluation during training. -> * `--validation_epochs`: Evaluate the model every this many epochs. - - -> Arguments that rarely need changing -> * `--seed`: Random seed, used to make training reproducible. Tip: with the current paddle, setting this seed still does not give perfectly reproducible results. -> * `--adam_beta1`: The `beta1` hyperparameter of the `AdamW` optimizer. Defaults to `0.9`. -> * `--adam_beta2`: The `beta2` hyperparameter of the `AdamW` optimizer. Defaults to `0.999`. -> * `--adam_weight_decay`: The `weight_decay` hyperparameter of the `AdamW` optimizer. Defaults to `1e-2`. -> * `--adam_epsilon`: The `epsilon` hyperparameter of the `AdamW` optimizer. Defaults to `1e-8`. -> * `--max_grad_norm`: Maximum gradient norm (for gradient clipping). The default of `-1` means it is not used. -> * `--logging_dir`: Directory for Tensorboard or VisualDL logs. Note: this path is joined with the output directory, i.e. the final log path is `output_dir/logging_dir`. -> * `--report_to`: Tool used for logging, one of `["tensorboard", "visualdl"]`; defaults to `visualdl`. If `tensorboard` is chosen, install it with `pip install tensorboardX`. -> * `--push_to_hub`: Whether to upload the model to the `huggingface hub`; defaults to `False`. -> * `--hub_token`: The `token` used to upload to the `huggingface hub`; not needed if already logged in. -> * `--hub_model_id`: Name of the repository on the `huggingface hub` to keep in sync with the local `output_dir`; `None` means the name of `output_dir` is used as the repository name. - - -#### 1.2.3 Single-node multi-GPU training -By setting `--gpus` we can select GPUs `0,1,2,3`. Here we only train for `1000` steps, because `1000 steps x 4 GPUs` is roughly equivalent to `training 4000 steps on a single GPU`. - -```bash -export MODEL_NAME="CompVis/stable-diffusion-v1-4" -# Set this to the directory where the images you just downloaded were saved -export DATA_DIR="cat_toy_images" - -python -u -m paddle.distributed.launch --gpus "0,1,2,3" train_textual_inversion.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --train_data_dir=$DATA_DIR \ - --learnable_property="object" \ - --placeholder_token="" --initializer_token="toy" \ - --resolution=512 \ - --train_batch_size=1 \ - --gradient_accumulation_steps=4 \ - --max_train_steps=1000 \ - --learning_rate=5.0e-04 \ - --scale_lr \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --seed 42 \ - --output_dir="textual_inversion_cat" \ - --validation_prompt "A backpack" \ - --validation_epochs 1 -``` - - -#### 1.2.4 Generating images with the trained model - -(1) Loading the weights saved in output_dir -After training finishes, the model is automatically saved to the `output_dir` directory; in the example above it ends up in the `textual_inversion_cat` folder. We can load it quickly with `StableDiffusionPipeline`. - -```python -from ppdiffusers import StableDiffusionPipeline - -# Path of the model to load; here we pass the output_dir used during training -model_path = "textual_inversion_cat" -pipe = StableDiffusionPipeline.from_pretrained(model_path) - -# Note: this is the token we defined when training the model. -prompt = "A backpack" - -image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0] - -image.save("cat-backpack.png") -``` -
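As a small editorial extension of the example above (not in the original README), several candidates for the same prompt can be generated in one call; the sketch below assumes the fine-tuned pipeline saved in `textual_inversion_cat` and uses the pipeline's standard `num_images_per_prompt` argument together with a seeded generator:

```python
import paddle
from ppdiffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("textual_inversion_cat")

# Fixed seed so the candidates can be reproduced.
generator = paddle.Generator().manual_seed(42)
images = pipe(
    "A backpack",
    num_inference_steps=50,
    guidance_scale=7.5,
    num_images_per_prompt=4,
    generator=generator,
).images
for i, image in enumerate(images):
    image.save(f"cat-backpack-{i}.png")
```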


- -(2) Loading an existing `learned_embeds.pdparams` file - -```python -import paddle -from ppdiffusers import StableDiffusionPipeline -# Path of the model to load; here we load the weights our fine-tuning run started from -model_path = "CompVis/stable-diffusion-v1-4" -pipe = StableDiffusionPipeline.from_pretrained(model_path) - -# The style or object embedding weights to load -learned_embeded_path = "./textual_inversion_cat/learned_embeds-steps-1000.pdparams" -for token, embeds in paddle.load(learned_embeded_path).items(): - pipe.tokenizer.add_tokens(token) - pipe.text_encoder.resize_token_embeddings(len(pipe.tokenizer)) - token_id = pipe.tokenizer.convert_tokens_to_ids(token) - with paddle.no_grad(): - pipe.text_encoder.get_input_embeddings().weight[token_id] = embeds - -print(token) -# -prompt = "A backpack" -image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0] - -image.save("cat-backpack.png") -``` - -### 1.3 huang-guang-jian: an example of training a style - -Before training starts, we need to prepare the 3-5 images to train on. For this example, the images can be downloaded from [here](https://huggingface.co/sd-concepts-library/huang-guang-jian) and saved into a folder named `huang_guang_jian_images`. -
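The same download approach as in section 1.2 works here as well; a minimal editorial sketch, assuming `huggingface_hub` is installed and that the repo keeps its images under `concept_images/` (the usual layout for `sd-concepts-library` repos):

```python
from huggingface_hub import snapshot_download

# Download only the concept_images/ folder, then copy its contents into huang_guang_jian_images/.
repo_dir = snapshot_download(repo_id="sd-concepts-library/huang-guang-jian", allow_patterns=["concept_images/*"])
print("Downloaded to:", repo_dir)
```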


- - -#### 1.3.1 Single-node single-GPU training -```bash -export MODEL_NAME="CompVis/stable-diffusion-v1-4" -# Set this to the directory where the images you just downloaded were saved -export DATA_DIR="huang_guang_jian_images" - -python -u train_textual_inversion.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --train_data_dir=$DATA_DIR \ - --learnable_property="style" \ - --placeholder_token="" --initializer_token="style" \ - --resolution=512 \ - --train_batch_size=1 \ - --gradient_accumulation_steps=4 \ - --max_train_steps=3000 \ - --learning_rate=5.0e-04 \ - --scale_lr \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --seed 42 \ - --output_dir="huang_guang_jian_style" -``` - -For the argument descriptions, please refer to section 1.2. - -#### 1.3.2 Single-node multi-GPU training -By setting `--gpus` we can select GPUs `0,1,2,3`. Here we only train for `1000` steps, because `1000 steps x 4 GPUs` is roughly equivalent to `training 4000 steps on a single GPU`. - -```bash -export MODEL_NAME="CompVis/stable-diffusion-v1-4" -# Set this to the directory where the images you just downloaded were saved -export DATA_DIR="huang_guang_jian_images" - -python -u -m paddle.distributed.launch --gpus "0,1,2,3" train_textual_inversion.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --train_data_dir=$DATA_DIR \ - --learnable_property="style" \ - --placeholder_token="" --initializer_token="style" \ - --resolution=512 \ - --train_batch_size=1 \ - --gradient_accumulation_steps=4 \ - --max_train_steps=1000 \ - --learning_rate=5.0e-04 \ - --scale_lr \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --seed 42 \ - --output_dir="huang_guang_jian_style" -``` - -#### 1.3.3 Generating images with the trained model - -(1) Loading the weights saved in output_dir -After training finishes, the model is automatically saved to the `output_dir` directory; in the example above it ends up in the `huang_guang_jian_style` folder. We can load it quickly with `StableDiffusionPipeline`. - -```python -from ppdiffusers import StableDiffusionPipeline - -# Path of the model to load; here we pass the output_dir used during training -model_path = "huang_guang_jian_style" -pipe = StableDiffusionPipeline.from_pretrained(model_path) - -# Note: this is the token we defined when training the model. -prompt = "A pretty girl in " -image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0] - -image.save("huang-guang-jian-girl.png") -``` - -(2) Loading an existing `learned_embeds.pdparams` file - -```python -import paddle -from ppdiffusers import StableDiffusionPipeline -# Path of the model to load; here we load the weights our fine-tuning run started from -model_path = "CompVis/stable-diffusion-v1-4" -pipe = StableDiffusionPipeline.from_pretrained(model_path) - -# The style or object embedding weights to load -learned_embeded_path = "./huang_guang_jian_style/learned_embeds-steps-1000.pdparams" -for token, embeds in paddle.load(learned_embeded_path).items(): - pipe.tokenizer.add_tokens(token) - pipe.text_encoder.resize_token_embeddings(len(pipe.tokenizer)) - token_id = pipe.tokenizer.convert_tokens_to_ids(token) - with paddle.no_grad(): - pipe.text_encoder.get_input_embeddings().weight[token_id] = embeds - -print(token) -# -prompt = "A pretty girl in " -image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0] - -image.save("huang-guang-jian-girl.png") -``` -
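Not part of the original README, but the same loading pattern extends naturally to several concepts at once; a sketch, assuming both embedding files from the examples above exist locally:

```python
import paddle
from ppdiffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")

# Hypothetical local paths: the object and style embeddings trained in sections 1.2 and 1.3.
embedding_files = [
    "./textual_inversion_cat/learned_embeds.pdparams",
    "./huang_guang_jian_style/learned_embeds.pdparams",
]
for path in embedding_files:
    # Each file maps the trained placeholder token to its embedding vector.
    for token, embeds in paddle.load(path).items():
        pipe.tokenizer.add_tokens(token)
        pipe.text_encoder.resize_token_embeddings(len(pipe.tokenizer))
        token_id = pipe.tokenizer.convert_tokens_to_ids(token)
        with paddle.no_grad():
            pipe.text_encoder.get_input_embeddings().weight[token_id] = embeds
```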


- -## 2 参考资料 -- https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion -- https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb -- https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_conceptualizer_inference.ipynb -- https://huggingface.co/sd-concepts-library diff --git a/ppdiffusers/examples/textual_inversion/requirements.txt b/ppdiffusers/examples/textual_inversion/requirements.txt deleted file mode 100644 index d77a600a0daf..000000000000 --- a/ppdiffusers/examples/textual_inversion/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -paddlenlp>=2.6.0rc0 -Pillow -ppdiffusers>=0.16.1 \ No newline at end of file diff --git a/ppdiffusers/examples/textual_inversion/train_textual_inversion.py b/ppdiffusers/examples/textual_inversion/train_textual_inversion.py deleted file mode 100644 index 8c8c15f9f277..000000000000 --- a/ppdiffusers/examples/textual_inversion/train_textual_inversion.py +++ /dev/null @@ -1,930 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import contextlib -import gc -import glob -import math -import os -import random -import sys -from pathlib import Path -from typing import Optional - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from huggingface_hub import HfFolder, Repository, create_repo, whoami -from paddle.distributed.fleet.utils.hybrid_parallel_util import ( - fused_allreduce_gradients, -) -from paddle.io import BatchSampler, DataLoader, Dataset, DistributedBatchSampler -from paddle.optimizer import AdamW -from paddle.vision.transforms import RandomHorizontalFlip -from PIL import Image -from tqdm.auto import tqdm - -from paddlenlp.trainer import set_seed -from paddlenlp.transformers import AutoTokenizer, PretrainedConfig -from paddlenlp.utils.log import logger -from ppdiffusers import ( - AutoencoderKL, - DDPMScheduler, - DiffusionPipeline, - DPMSolverMultistepScheduler, - UNet2DConditionModel, - is_ppxformers_available, -) -from ppdiffusers.optimization import get_scheduler -from ppdiffusers.training_utils import freeze_params, unfreeze_params, unwrap_model -from ppdiffusers.utils import PIL_INTERPOLATION, check_min_version - -check_min_version("0.16.1") - - -def url_or_path_join(*path_list): - return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list) - - -def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str): - try: - text_encoder_config = PretrainedConfig.from_pretrained( - url_or_path_join(pretrained_model_name_or_path, "text_encoder") - ) - model_class = text_encoder_config.architectures[0] - except Exception: - model_class = "LDMBertModel" - if model_class == "CLIPTextModel": - from paddlenlp.transformers import CLIPTextModel - - return CLIPTextModel - elif model_class == 
"RobertaSeriesModelWithTransformation": - from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesModelWithTransformation, - ) - - return RobertaSeriesModelWithTransformation - elif model_class == "BertModel": - from paddlenlp.transformers import BertModel - - return BertModel - elif model_class == "LDMBertModel": - from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import ( - LDMBertModel, - ) - - return LDMBertModel - else: - raise ValueError(f"{model_class} is not supported.") - - -def set_recompute(model, value=False): - def fn(layer): - # ldmbert - if hasattr(layer, "enable_recompute"): - layer.enable_recompute = value - print("Set", layer.__class__, "recompute", layer.enable_recompute) - # # unet - # if hasattr(layer, "gradient_checkpointing"): - # layer.gradient_checkpointing = value - # print("Set", layer.__class__, "recompute", layer.gradient_checkpointing) - - model.apply(fn) - - -def get_report_to(args): - if args.report_to == "visualdl": - from visualdl import LogWriter - - writer = LogWriter(logdir=args.logging_dir) - elif args.report_to == "tensorboard": - from tensorboardX import SummaryWriter - - writer = SummaryWriter(logdir=args.logging_dir) - else: - raise ValueError("report_to must be in ['visualdl', 'tensorboard']") - return writer - - -def save_progress(text_encoder, placeholder_token_ids, args, save_path): - logger.info("Saving embeddings") - learned_embeds = ( - unwrap_model(text_encoder) - .get_input_embeddings() - .weight[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] - ) - learned_embeds_dict = {args.placeholder_token: learned_embeds.detach()} - paddle.save(learned_embeds_dict, save_path) - - -def parse_args(): - parser = argparse.ArgumentParser(description="Simple example of a training script.") - parser.add_argument( - "--save_steps", - type=int, - default=500, - help="Save learned_embeds.pdparams every X updates steps.", - ) - parser.add_argument( - "--only_save_embeds", - action="store_true", - default=True, - help="Save only the embeddings for the new concept.", - ) - parser.add_argument( - "--num_vectors", - type=int, - default=1, - help="How many textual inversion vectors shall be used to learn the concept.", - ) - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default=None, - required=True, - help="Path to pretrained model or model identifier from local models.", - ) - parser.add_argument( - "--tokenizer_name", - type=str, - default=None, - help="Pretrained tokenizer name or path if not the same as model_name", - ) - parser.add_argument( - "--train_data_dir", type=str, default=None, required=True, help="A folder containing the training data." - ) - parser.add_argument( - "--placeholder_token", - type=str, - default=None, - required=True, - help="A token to use as a placeholder for the concept.", - ) - parser.add_argument( - "--initializer_token", type=str, default=None, required=True, help="A token to use as initializer word." 
- ) - parser.add_argument("--learnable_property", type=str, default="object", help="Choose between 'object' and 'style'") - parser.add_argument("--repeats", type=int, default=100, help="How many times to repeat the training data.") - parser.add_argument( - "--output_dir", - type=str, - default="text-inversion-model", - help="The output directory where the model predictions and checkpoints will be written.", - ) - parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") - parser.add_argument( - "--height", - type=int, - default=None, - help=( - "The height for input images, all the images in the train/validation dataset will be resized to this" - " height" - ), - ) - parser.add_argument( - "--width", - type=int, - default=None, - help=( - "The width for input images, all the images in the train/validation dataset will be resized to this" - " width" - ), - ) - parser.add_argument( - "--resolution", - type=int, - default=512, - help=( - "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution" - ), - ) - parser.add_argument( - "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution." - ) - parser.add_argument( - "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader." - ) - parser.add_argument("--num_train_epochs", type=int, default=100) - parser.add_argument( - "--max_train_steps", - type=int, - default=5000, - help="Total number of training steps to perform. If provided, overrides num_train_epochs.", - ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument( - "--gradient_checkpointing", - action="store_true", - help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=1e-4, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument( - "--scale_lr", - action="store_true", - default=False, - help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", - ) - parser.add_argument( - "--lr_scheduler", - type=str, - default="constant", - help=( - 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]' - ), - ) - parser.add_argument( - "--dataloader_num_workers", - type=int, - default=0, - help=( - "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." - ), - ) - parser.add_argument( - "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." 
- ) - parser.add_argument( - "--lr_num_cycles", - type=int, - default=1, - help="Number of hard resets of the lr in cosine_with_restarts scheduler.", - ) - parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") - parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") - parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") - parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") - parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") - parser.add_argument("--max_grad_norm", default=-1, type=float, help="Max gradient norm.") - parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") - parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") - parser.add_argument( - "--hub_model_id", - type=str, - default=None, - help="The name of the repository to keep in sync with the local `output_dir`.", - ) - parser.add_argument( - "--logging_dir", - type=str, - default="logs", - help=( - "[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. Will default to" - "*output_dir/logs" - ), - ) - parser.add_argument( - "--report_to", - type=str, - default="visualdl", - help=( - 'The integration to report the results and logs to. Supported platforms are `"visualdl"`' - ' (default), `"tensorboard"`.' - ), - ) - parser.add_argument("--language", default="en", choices=["en", "zh", "zh_en"], help="Model language.") - parser.add_argument( - "--validation_prompt", - type=str, - default=None, - help="A prompt that is used during validation to verify that the model is learning.", - ) - parser.add_argument( - "--num_validation_images", - type=int, - default=4, - help="Number of images that should be generated during validation with `validation_prompt`.", - ) - parser.add_argument( - "--validation_epochs", - type=int, - default=50, - help=( - "Run validation every X epochs. Validation consists of running the prompt" - " `args.validation_prompt` multiple times: `args.num_validation_images`" - " and logging the images." - ), - ) - parser.add_argument( - "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." - ) - parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") - - args = parser.parse_args() - - if args.train_data_dir is None: - raise ValueError("You must specify a train data directory.") - - if args.language == "en": - if "chinese-en" in args.pretrained_model_name_or_path.lower(): - args.language = "zh_en" - logger.info("Detect Chinese-English Model, we will set language to 'zh_en'. ") - elif "chinese" in args.pretrained_model_name_or_path.lower(): - args.language = "zh" - logger.info("Detect Chinese Model, we will set language to 'zh'. 
") - - args.logging_dir = os.path.join(args.output_dir, args.logging_dir) - if args.height is None or args.width is None and args.resolution is not None: - args.height = args.width = args.resolution - return args - - -imagenet_templates_small = [ - "a photo of a {}", - "a rendering of a {}", - "a cropped photo of the {}", - "the photo of a {}", - "a photo of a clean {}", - "a photo of a dirty {}", - "a dark photo of the {}", - "a photo of my {}", - "a photo of the cool {}", - "a close-up photo of a {}", - "a bright photo of the {}", - "a cropped photo of a {}", - "a photo of the {}", - "a good photo of the {}", - "a photo of one {}", - "a close-up photo of the {}", - "a rendition of the {}", - "a photo of the clean {}", - "a rendition of a {}", - "a photo of a nice {}", - "a good photo of a {}", - "a photo of the nice {}", - "a photo of the small {}", - "a photo of the weird {}", - "a photo of the large {}", - "a photo of a cool {}", - "a photo of a small {}", -] - -imagenet_style_templates_small = [ - "a painting in the style of {}", - "a rendering in the style of {}", - "a cropped painting in the style of {}", - "the painting in the style of {}", - "a clean painting in the style of {}", - "a dirty painting in the style of {}", - "a dark painting in the style of {}", - "a picture in the style of {}", - "a cool painting in the style of {}", - "a close-up painting in the style of {}", - "a bright painting in the style of {}", - "a cropped painting in the style of {}", - "a good painting in the style of {}", - "a close-up painting in the style of {}", - "a rendition in the style of {}", - "a nice painting in the style of {}", - "a small painting in the style of {}", - "a weird painting in the style of {}", - "a large painting in the style of {}", -] - -zh_imagenet_templates_small = [ - "一张{}的照片", - "{}的渲染", - "{}裁剪过的照片", - "一张干净的{}的照片", - "{}的黑暗照片", - "我的{}的照片", - "酷的{}的照片", - "{}的特写照片", - "{}的明亮照片", - "{}的裁剪照片", - "{}的照片", - "{}的好照片", - "一张{}的照片", - "干净的照片{}", - "一张漂亮的{}的照片", - "漂亮的照片{}", - "一张很酷的照片{}", - "一张奇怪的照片{}", -] - -zh_imagenet_style_templates_small = [ - "一幅{}风格的画", - "{}风格的渲染", - "{}风格的裁剪画", - "{}风格的绘画", - "{}风格的一幅干净的画", - "{}风格的黑暗画作", - "{}风格的图片", - "{}风格的一幅很酷的画", - "{}风格的特写画", - "一幅{}风格的明亮画作", - "{}风格的一幅好画", - "{}风格的特写画", - "{}风格的艺术画", - "一幅{}风格的漂亮画", - "一幅{}风格的奇怪的画", -] - - -class TextualInversionDataset(Dataset): - def __init__( - self, - data_root, - tokenizer, - learnable_property="object", # [object, style] - height=512, - width=512, - repeats=100, - interpolation="bicubic", - flip_p=0.5, - set="train", - placeholder_token="*", - center_crop=False, - language="en", - ): - self.data_root = data_root - self.tokenizer = tokenizer - self.learnable_property = learnable_property - self.height = height - self.width = width - self.placeholder_token = placeholder_token - self.center_crop = center_crop - self.flip_p = flip_p - - if not Path(data_root).exists(): - raise ValueError(f"{data_root} dir doesn't exists.") - - ext = ["png", "jpg", "jpeg", "bmp", "PNG", "JPG", "JPEG", "BMP"] - self.image_paths = [] - for e in ext: - self.image_paths.extend(glob.glob(os.path.join(data_root, "*." 
+ e))) - - self.num_images = len(self.image_paths) - self._length = self.num_images - - if set == "train": - self._length = self.num_images * repeats - - self.interpolation = { - "linear": PIL_INTERPOLATION["linear"], - "bilinear": PIL_INTERPOLATION["bilinear"], - "bicubic": PIL_INTERPOLATION["bicubic"], - "lanczos": PIL_INTERPOLATION["lanczos"], - }[interpolation] - - self.templates = [] - if learnable_property == "style": - if "en" in language: - self.templates.extend(imagenet_style_templates_small) - if "zh" in language: - self.templates.extend(zh_imagenet_style_templates_small) - else: - if "en" in language: - self.templates.extend(imagenet_templates_small) - if "zh" in language: - self.templates.extend(zh_imagenet_templates_small) - - self.flip_transform = RandomHorizontalFlip(prob=self.flip_p) - - def __len__(self): - return self._length - - def __getitem__(self, i): - example = {} - image = Image.open(self.image_paths[i % self.num_images]) - - if not image.mode == "RGB": - image = image.convert("RGB") - - placeholder_string = self.placeholder_token - text = random.choice(self.templates).format(placeholder_string) - - example["input_ids"] = self.tokenizer( - text, - padding="max_length", - truncation=True, - max_length=self.tokenizer.model_max_length, - return_attention_mask=False, - ).input_ids - - # default to score-sde preprocessing - img = np.array(image).astype(np.uint8) - - if self.center_crop: - crop = min(img.shape[0], img.shape[1]) - h, w, = ( - img.shape[0], - img.shape[1], - ) - img = img[(h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2] - - image = Image.fromarray(img) - image = image.resize((self.width, self.height), resample=self.interpolation) - - image = self.flip_transform(image) - image = np.array(image).astype(np.uint8) - image = (image / 127.5 - 1.0).astype(np.float32).transpose([2, 0, 1]) - - example["pixel_values"] = image - return example - - -def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): - if token is None: - token = HfFolder.get_token() - if organization is None: - username = whoami(token)["name"] - return f"{username}/{model_id}" - else: - return f"{organization}/{model_id}" - - -def main(): - paddle_dtype = paddle.float32 - args = parse_args() - rank = paddle.distributed.get_rank() - is_main_process = rank == 0 - num_processes = paddle.distributed.get_world_size() - if num_processes > 1: - paddle.distributed.init_parallel_env() - - # If passed along, set the training seed now. 
- if args.seed is not None: - set_seed(args.seed) - - # Handle the repository creation - if is_main_process: - if args.output_dir is not None: - os.makedirs(args.output_dir, exist_ok=True) - if args.push_to_hub: - if args.hub_model_id is None: - repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) - else: - repo_name = args.hub_model_id - create_repo(repo_name, exist_ok=True, token=args.hub_token) - repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) - - with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: - if "step_*" not in gitignore: - gitignore.write("step_*\n") - if "epoch_*" not in gitignore: - gitignore.write("epoch_*\n") - - # Load tokenizer - if args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) - elif args.pretrained_model_name_or_path: - # support windows "\" - tokenizer = AutoTokenizer.from_pretrained(url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) - # Load scheduler and models - noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") - - # Add the placeholder token in tokenizer - placeholder_tokens = [args.placeholder_token] - if args.num_vectors < 1: - raise ValueError(f"--num_vectors has to be larger or equal to 1, but is {args.num_vectors}") - - # add dummy tokens for multi-vector - additional_tokens = [] - for i in range(1, args.num_vectors): - additional_tokens.append(f"{args.placeholder_token}_{i}") - placeholder_tokens += additional_tokens - - num_added_tokens = tokenizer.add_tokens(placeholder_tokens) - if num_added_tokens != args.num_vectors: - raise ValueError( - f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different" - " `placeholder_token` that is not already in the tokenizer." 
- ) - - # Convert the initializer_token, placeholder_token to ids - initializer_token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)["input_ids"] - if len(initializer_token_ids) < 1: - raise ValueError("The initializer token must be a greater equal than one.") - - placeholder_token_ids = tokenizer.convert_tokens_to_ids(placeholder_tokens) - - text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path) - - text_encoder = text_encoder_cls.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "text_encoder") - ) - text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict() - if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]: - use_attention_mask = True - else: - use_attention_mask = False - vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae") - unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet") - - # Resize the token embeddings as we are adding new special tokens to the tokenizer - text_encoder.resize_token_embeddings(len(tokenizer)) - - # Initialise the newly added placeholder token with the embeddings of the initializer token - with paddle.no_grad(): - token_embeds = text_encoder.get_input_embeddings() - # we will compute mean - for token_id in placeholder_token_ids: - token_embeds.weight[token_id] = paddle.stack( - [token_embeds.weight[each] for each in initializer_token_ids] - ).mean(0) - - # Freeze vae and unet - freeze_params(vae.parameters()) - freeze_params(unet.parameters()) - # Freeze all parameters except for the token embeddings in text encoder - freeze_params(text_encoder.parameters()) - unfreeze_params(text_encoder.get_input_embeddings().parameters()) - - if args.gradient_checkpointing: - # unet.enable_gradient_checkpointing() - set_recompute(text_encoder, True) - - if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): - try: - unet.enable_xformers_memory_efficient_attention() - except Exception as e: - logger.warn( - "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}" - ) - - train_dataset = TextualInversionDataset( - data_root=args.train_data_dir, - tokenizer=tokenizer, - height=args.height, - width=args.width, - placeholder_token=args.placeholder_token, - repeats=args.repeats, - learnable_property=args.learnable_property, - center_crop=args.center_crop, - set="train", - language=args.language, - interpolation="bilinear", - ) - - def collate_fn(examples): - input_ids = [example["input_ids"] for example in examples] - pixel_values = paddle.to_tensor([example["pixel_values"] for example in examples], dtype="float32") - input_ids = tokenizer.pad( - {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pd" - ).input_ids - return { - "input_ids": input_ids, - "pixel_values": pixel_values, - } - - train_sampler = ( - DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) - if num_processes > 1 - else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) - ) - train_dataloader = DataLoader( - train_dataset, batch_sampler=train_sampler, collate_fn=collate_fn, num_workers=args.dataloader_num_workers - ) - - # Scheduler and math around the number of training steps. 
- num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - if args.max_train_steps is None: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - - if args.scale_lr: - args.learning_rate = ( - args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes - ) - - # Initialize the lr_scheduler - lr_scheduler = get_scheduler( - args.lr_scheduler, - learning_rate=args.learning_rate, - num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, - num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, - num_cycles=args.lr_num_cycles, - power=args.lr_power, - ) - # Initialize the optimizer - optimizer = AdamW( - learning_rate=lr_scheduler, - parameters=text_encoder.get_input_embeddings().parameters(), # only optimize the embeddings - beta1=args.adam_beta1, - beta2=args.adam_beta2, - weight_decay=args.adam_weight_decay, - epsilon=args.adam_epsilon, - grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None, - ) - - if num_processes > 1: - text_encoder = paddle.DataParallel(text_encoder) - - if is_main_process: - logger.info("----------- Configuration Arguments -----------") - for arg, value in sorted(vars(args).items()): - logger.info("%s: %s" % (arg, value)) - logger.info("------------------------------------------------") - writer = get_report_to(args) - - # Train! - total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps - - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num batches each epoch = {len(train_dataloader)}") - logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") - logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {args.max_train_steps}") - - # Only show the progress bar once on each machine. 
- progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process) - progress_bar.set_description("Train Steps") - global_step = 0 - - # keep original embeddings as reference - orig_embeds_params = unwrap_model(text_encoder).get_input_embeddings().weight.clone() - - index_no_updates = paddle.ones((len(tokenizer),), dtype=paddle.bool) - index_no_updates[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] = False - index_no_updates = index_no_updates.cast("int64").sum() - # Keep vae and unet in eval model as we don't train these - vae.eval() - unet.train() - text_encoder.train() - - for epoch in range(args.num_train_epochs): - for step, batch in enumerate(train_dataloader): - # Convert images to latent space - latents = vae.encode(batch["pixel_values"]).latent_dist.sample() - latents = latents * vae.config.scaling_factor - - # Sample noise that we'll add to the latents - noise = paddle.randn(latents.shape, dtype=latents.dtype) - if args.noise_offset: - # https://www.crosslabs.org//blog/diffusion-with-offset-noise - noise += args.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype - ) - batch_size = latents.shape[0] - # Sample a random timestep for each image - timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,)).cast("int64") - - # Add noise to the latents according to the noise magnitude at each timestep - # (this is the forward diffusion process) - noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - - if num_processes > 1 and ( - args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0) - ): - # grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0: - # gradient_checkpointing, no_sync every where - # gradient_checkpointing + grad_acc, no_sync every where - # unet_ctx_manager = unet.no_sync() - text_encoder_ctx_manager = text_encoder.no_sync() - else: - # unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() - text_encoder_ctx_manager = ( - contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() - ) - - with text_encoder_ctx_manager: - # Get the text embedding for conditioning - if use_attention_mask: - attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64") - else: - attention_mask = None - encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0] - - # with unet_ctx_manager: - # Predict the noise or sample - model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample - - # Get the target for loss depending on the prediction type - if noise_scheduler.config.prediction_type == "epsilon": - target = noise - elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, timesteps) - else: - raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") - loss = F.mse_loss(model_pred, target, reduction="mean") - - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - loss.backward() - - if (step + 1) % args.gradient_accumulation_steps == 0: - if num_processes > 1 and args.gradient_checkpointing: - fused_allreduce_gradients(unwrap_model(text_encoder).get_input_embeddings().parameters(), None) - optimizer.step() - lr_scheduler.step() - optimizer.clear_grad() - # Let's make sure we don't update any embedding weights besides the newly added token - with paddle.no_grad(): - 
unwrap_model(text_encoder).get_input_embeddings().weight[:index_no_updates] = orig_embeds_params[ - :index_no_updates - ] - - progress_bar.update(1) - global_step += 1 - step_loss = loss.item() * args.gradient_accumulation_steps - logs = { - "epoch": str(epoch).zfill(4), - "step_loss": round(step_loss, 10), - "lr": lr_scheduler.get_lr(), - } - progress_bar.set_postfix(**logs) - if is_main_process: - for name, val in logs.items(): - if name == "epoch": - continue - writer.add_scalar(f"train/{name}", val, global_step) - - if global_step % args.save_steps == 0: - save_path = os.path.join(args.output_dir, f"learned_embeds-steps-{global_step}.pdparams") - save_progress(text_encoder, placeholder_token_ids, args, save_path) - - if global_step >= args.max_train_steps: - break - - if is_main_process: - if args.validation_prompt is not None and epoch % args.validation_epochs == 0: - logger.info( - f"Running validation... \n Generating {args.num_validation_images} images with prompt:" - f" {args.validation_prompt}." - ) - # create pipeline - pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - text_encoder=unwrap_model(text_encoder), - tokenizer=tokenizer, - paddle_dtype=paddle_dtype, - safety_checker=None, - requires_safety_checker=False, - ) - pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) - pipeline.set_progress_bar_config(disable=True) - - # run inference - generator = paddle.Generator().manual_seed(args.seed) if args.seed else None - images = [ - pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0] - for _ in range(args.num_validation_images) - ] - np_images = np.stack([np.asarray(img) for img in images]) - - if args.report_to == "tensorboard": - writer.add_images("test", np_images, epoch, dataformats="NHWC") - else: - writer.add_image("test", np_images, epoch, dataformats="NHWC") - - del pipeline - gc.collect() - vae.eval() - unet.train() - text_encoder.train() - - if is_main_process: - writer.close() - if args.push_to_hub and args.only_save_embeds: - logger.warn("Enabling full model saving because --push_to_hub=True was specified.") - save_full_model = True - else: - save_full_model = not args.only_save_embeds - if save_full_model: - pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - text_encoder=unwrap_model(text_encoder), - tokenizer=tokenizer, - ) - pipeline.save_pretrained(args.output_dir) - # Save the newly trained embeddings - save_path = os.path.join(args.output_dir, "learned_embeds.pdparams") - save_progress(text_encoder, placeholder_token_ids, args, save_path) - - if args.push_to_hub: - repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) - - -if __name__ == "__main__": - main() diff --git a/ppdiffusers/examples/tomesd/README.md b/ppdiffusers/examples/tomesd/README.md deleted file mode 100644 index 768caf4716c9..000000000000 --- a/ppdiffusers/examples/tomesd/README.md +++ /dev/null @@ -1,120 +0,0 @@ -# ToME SD -![A diffusion block with ToMe applied and the resulting images at different merge ratios.](https://raw.githubusercontent.com/dbolya/tomesd/main/examples/assets/method.jpg) - -[Token Merging for Fast Stable Diffusion](https://arxiv.org/pdf/2303.17604.pdf) 是一种 token 合并技术,它通过合并冗余的 token 从而可以减少 transformer 的计算量。该项技术可以应用到所有含有 transformer 结构的扩散模型中,比如:StableDiffusion、ControlNet 等模型。 - -ToMe for SD 生成的图像有着如下优势: -- 生成的结果能够接近原始图像; -- 生成速度提高了 2 倍; -- 即使合并了一半以上的token (60%),显存减少了约 5.7 倍。 - -**Note:** 
下面是原作者repo中贴出的fid、时间和显存占用对比表。 -| Method | r% | FID ↓ | Time (s/im) ↓ | Memory (GB/im) ↓ | -|-----------------------------|----|:------|:--------------------------|:------------------------| -| Baseline _(Original Model)_ | 0 | 33.12 | 3.09 | 3.41 | -| w/ **ToMe for SD** | 10 | 32.86 | 2.56 (**1.21x** _faster_) | 2.99 (**1.14x** _less_) | -| | 20 | 32.86 | 2.29 (**1.35x** _faster_) | 2.17 (**1.57x** _less_) | -| | 30 | 32.80 | 2.06 (**1.50x** _faster_) | 1.71 (**1.99x** _less_) | -| | 40 | 32.87 | 1.85 (**1.67x** _faster_) | 1.26 (**2.71x** _less_) | -| | 50 | 33.02 | 1.65 (**1.87x** _faster_) | 0.89 (**3.83x** _less_) | -| | 60 | 33.37 | 1.52 (**2.03x** _faster_) | 0.60 (**5.68x** _less_) | - -配置信息: -- GPU:4090 -- 分辨率:512x512 -- Scheduler:PLMS -- 精度:FP16 -- 推理步数:50 -- 数据集:ImageNet-1k - -## 使用例子 -安装develop版本的ppdiffusers -```sh -pip install "ppdiffusers>=0.16.1" -``` - -下面是 StableDiffusion + ToME 技术的例子 - -```python -import paddle -from ppdiffusers import StableDiffusionPipeline - -pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None, paddle_dtype=paddle.float16) - -# 我们可以开启 xformers -# pipe.enable_xformers_memory_efficient_attention() - -# Apply ToMe with a 50% merging ratio -pipe.apply_tome(ratio=0.5) # Can also use pipe.unet in place of pipe here - -generator = paddle.Generator().manual_seed(0) -image = pipe("a photo of an astronaut riding a horse on mars", generator=generator).images[0] -image.save("astronaut.png") -``` - - -下面是 ControlNet + ToME 技术的例子 -```python -import paddle -from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline -from ppdiffusers.utils import load_image - -controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") -pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet, paddle_dtype=paddle.float16 -) - -# Apply ToMe with a 50% merging ratio -pipe.apply_tome(ratio=0.5) # Can also use pipe.unet in place of pipe here - -# 我们可以开启 xformers -# pipe.enable_xformers_memory_efficient_attention() -generator = paddle.Generator().manual_seed(0) -prompt = "bird" -image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" -) - -image = pipe(prompt, image, generator=generator).images[0] - -image.save("bird.png") -``` - -## 速度比较 -测试代码参考自 https://github.com/huggingface/diffusers/pull/2303 - -|Batch Size|Vanilla Attention|Vanilla Attention + TOME 0.5/0.749|xFormers Cutlass + TOME 0.5/0.749 | -|:----|:----|:----|:----| -|1|2.08 s|2.15 s / 2.06 s|1.99 s / 1.95 s| -|10|14.15 s|10.94 s / 10.04 s|9.21 s / 8.87 s| -|16|21.93 s|16.73 s / 15.31 s|13.98 s / 13.95 s| -|32|42.93 s|32.88 s / 29.48 s|26.82 s / 29.08 s| -|64|OOM|63.79 s / 58.21 s|52.86 s / 50.8 s| - -配置信息: -- GPU:A100 -- 分辨率:512x512 -- Scheduler:DPMSolverMultistepScheduler -- 精度:FP16 -- 推理步数:50 - -## Citation - -If you use ToMe for SD or this codebase in your work, please cite: -``` -@article{bolya2023tomesd, - title={Token Merging for Fast Stable Diffusion}, - author={Bolya, Daniel and Hoffman, Judy}, - journal={arXiv}, - year={2023} -} -``` -If you use ToMe in general please cite the original work: -``` -@inproceedings{bolya2023tome, - title={Token Merging: Your {ViT} but Faster}, - author={Bolya, Daniel and Fu, Cheng-Yang and Dai, Xiaoliang and Zhang, Peizhao and Feichtenhofer, Christoph and Hoffman, Judy}, - booktitle={International Conference on Learning Representations}, - year={2023} 
-} -``` diff --git a/ppdiffusers/examples/unconditional_image_generation/README.md b/ppdiffusers/examples/unconditional_image_generation/README.md deleted file mode 100644 index bec582c2f8aa..000000000000 --- a/ppdiffusers/examples/unconditional_image_generation/README.md +++ /dev/null @@ -1,123 +0,0 @@ -## 训练样本 - -### 安装依赖 - -运行脚本之前,请确保安装库的训练依赖: - - -切换到 example 目录并且运行: -```bash -pip install -r requirements.txt -``` - - -### Unconditional Flowers - -下面的命令是使用Oxford Flowers dataset来训练一个DDPM UNet模型: - -```bash -python -u -m paddle.distributed.launch --gpus "0,1,2,3" train_unconditional.py \ - --dataset_name="huggan/flowers-102-categories" \ - --cache_dir 'data' \ - --resolution=64 --center_crop --random_flip \ - --output_dir="ddpm-ema-flowers-64" \ - --train_batch_size=16 \ - --num_epochs=100 \ - --gradient_accumulation_steps=1 \ - --use_ema \ - --learning_rate=1e-4 \ - --lr_warmup_steps=500 -``` - -完整的训练需要在4xV100 GPUs上训练2小时. - - - - -### Unconditional Pokemon - -下面的命令是Pokemon dataset上训练一个DDPM UNet模型: - -```bash -python -u -m paddle.distributed.launch --gpus "0,1,2,3" train_unconditional.py \ - --dataset_name="huggan/pokemon" \ - --resolution=64 --center_crop --random_flip \ - --output_dir="ddpm-ema-pokemon-64" \ - --train_batch_size=16 \ - --num_epochs=100 \ - --gradient_accumulation_steps=1 \ - --use_ema \ - --learning_rate=1e-4 \ - --lr_warmup_steps=500 -``` - -完整的训练需要在4xV100 GPUs上训练2小时. - - - - -### 使用你自己的数据 - - - -要使用自己的数据集,有两种方法: - --您可以将自己的文件夹提供为`--train_data_dir` - --或者,您可以将数据集上传到hub,然后简单地传递`--dataset_name`参数。 - -下面,我们将对两者进行更详细的解释。 - -#### 将数据集作为文件夹提供 - -如果为自己的文件夹提供图像,脚本需要以下目录结构: - -```bash -data_dir/xxx.png -data_dir/xxy.png -data_dir/[...]/xxz.png -``` - -换句话说,脚本将负责收集文件夹中的所有图像。然后可以像这样运行脚本: - -```bash -python train_unconditional.py \ - --train_data_dir \ - -``` - -这个脚本将会使用 [`ImageFolder`](https://huggingface.co/docs/datasets/v2.0.0/en/image_process#imagefolder) 特征,并且自动把这些目录变成Dataset对象。 - -#### 把你的数据上传到hub上 - -使用[`ImageFolder`](https://huggingface.co/docs/datasets/v2.0.0/en/image_process#imagefolder)中提供的功能将图像数据集上传到hub中心非常容易。只需执行以下操作: - -```python -from datasets import load_dataset - -# example 1: local folder -dataset = load_dataset("imagefolder", data_dir="path_to_your_folder") - -# example 2: local files (supported formats are tar, gzip, zip, xz, rar, zstd) -dataset = load_dataset("imagefolder", data_files="path_to_zip_file") - -# example 3: remote files (supported formats are tar, gzip, zip, xz, rar, zstd) -dataset = load_dataset("imagefolder", data_files="https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip") - -# example 4: providing several splits -dataset = load_dataset("imagefolder", data_files={"train": ["path/to/file1", "path/to/file2"], "test": ["path/to/file3", "path/to/file4"]}) -``` - -`ImageFolder将创建包含PIL编码图像的“image”列。 - -下一步,将数据集推到hub上 - -```python -# assuming you have ran the huggingface-cli login command in a terminal -dataset.push_to_hub("name_of_your_dataset") - -# if you want to push to a private repo, simply pass private=True: -dataset.push_to_hub("name_of_your_dataset", private=True) -``` - -就这样!现在,只需将“--dataset_name”参数设置为hub上数据集的名称,即可训练模型。 diff --git a/ppdiffusers/examples/unconditional_image_generation/requirements.txt b/ppdiffusers/examples/unconditional_image_generation/requirements.txt deleted file mode 100644 index 0b4b5979887e..000000000000 --- a/ppdiffusers/examples/unconditional_image_generation/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -datasets -visualdl -paddlenlp>=2.6.0rc0 -Pillow 
-ppdiffusers>=0.16.1 \ No newline at end of file diff --git a/ppdiffusers/examples/unconditional_image_generation/train_unconditional.py b/ppdiffusers/examples/unconditional_image_generation/train_unconditional.py deleted file mode 100644 index e6fe7a509a8f..000000000000 --- a/ppdiffusers/examples/unconditional_image_generation/train_unconditional.py +++ /dev/null @@ -1,616 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import inspect -import logging -import math -import os -from pathlib import Path -from typing import Optional - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from datasets import load_dataset -from huggingface_hub import HfFolder, create_repo, whoami -from paddle.vision import transforms -from tqdm.auto import tqdm - -from paddlenlp.trainer import set_seed -from paddlenlp.utils.log import logger -from ppdiffusers import DDPMPipeline, DDPMScheduler, UNet2DModel -from ppdiffusers.optimization import get_scheduler -from ppdiffusers.training_utils import EMAModel, unwrap_model -from ppdiffusers.utils import check_min_version, is_ppxformers_available - -# Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.1") - - -def _extract_into_tensor(arr, timesteps, broadcast_shape): - """ - Extract values from a 1-D numpy array for a batch of indices. - - :param arr: the 1-D numpy array. - :param timesteps: a tensor of indices into the array to extract. - :param broadcast_shape: a larger shape of K dimensions with the batch - dimension equal to the length of timesteps. - :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims. - """ - if not isinstance(arr, paddle.Tensor): - arr = paddle.to_tensor(arr) - res = arr[timesteps].cast("float32") - while len(res.shape) < len(broadcast_shape): - res = res[..., None] - return res.expand(broadcast_shape) - - -def get_report_to(args): - if args.logger == "visualdl": - from visualdl import LogWriter - - writer = LogWriter(logdir=args.logging_dir) - elif args.logger == "tensorboard": - from tensorboardX import SummaryWriter - - writer = SummaryWriter(logdir=args.logging_dir) - else: - raise ValueError("logger must be in ['visualdl', 'tensorboard']") - return writer - - -def parse_args(): - parser = argparse.ArgumentParser(description="Simple example of a training script.") - parser.add_argument( - "--dataset_name", - type=str, - default=None, - help=( - "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," - " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," - " or to a folder containing files that HF Datasets can understand." 
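# Editor's aside (illustrative only): `_extract_into_tensor`, defined near the top of this
# file, gathers one scheduler value per batch element and pads it with singleton dimensions
# so it broadcasts over an image batch. For example, with hypothetical values:
#
#     arr = noise_scheduler.alphas_cumprod              # shape [num_train_timesteps]
#     timesteps = paddle.to_tensor([3, 7])              # one timestep per sample
#     out = _extract_into_tensor(arr, timesteps, (2, 1, 1, 1))
#     # out.shape == [2, 1, 1, 1] and out[i, 0, 0, 0] == arr[timesteps[i]],
#     # ready to multiply against a batch of [2, C, H, W] images.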
- ), - ) - parser.add_argument( - "--dataset_config_name", - type=str, - default=None, - help="The config of the Dataset, leave as None if there's only one config.", - ) - parser.add_argument( - "--model_config_name_or_path", - type=str, - default=None, - help="The config of the UNet model to train, leave as None to use standard DDPM configuration.", - ) - parser.add_argument( - "--train_data_dir", - type=str, - default=None, - help=( - "A folder containing the training data. Folder contents must follow the structure described in" - " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" - " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." - ), - ) - parser.add_argument( - "--output_dir", - type=str, - default="ddpm-model-64", - help="The output directory where the model predictions and checkpoints will be written.", - ) - parser.add_argument("--overwrite_output_dir", action="store_true") - parser.add_argument( - "--cache_dir", - type=str, - default=None, - help="The directory where the downloaded models and datasets will be stored.", - ) - parser.add_argument( - "--resolution", - type=int, - default=64, - help=( - "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution" - ), - ) - parser.add_argument( - "--center_crop", - default=False, - action="store_true", - help=( - "Whether to center crop the input images to the resolution. If not set, the images will be randomly" - " cropped. The images will be resized to the resolution first before cropping." - ), - ) - parser.add_argument( - "--random_flip", - default=False, - action="store_true", - help="whether to randomly flip images horizontally", - ) - parser.add_argument( - "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader." - ) - parser.add_argument( - "--eval_batch_size", type=int, default=16, help="The number of images to generate for evaluation." - ) - parser.add_argument( - "--dataloader_num_workers", - type=int, - default=0, - help=( - "The number of subprocesses to use for data loading. 0 means that the data will be loaded in the main" - " process." - ), - ) - parser.add_argument("--num_epochs", type=int, default=100) - parser.add_argument("--save_images_epochs", type=int, default=10, help="How often to save images during training.") - parser.add_argument( - "--save_model_epochs", type=int, default=10, help="How often to save the model during training." - ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=1e-4, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument( - "--lr_scheduler", - type=str, - default="cosine", - help=( - 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]' - ), - ) - parser.add_argument( - "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." 
- ) - parser.add_argument("--adam_beta1", type=float, default=0.95, help="The beta1 parameter for the Adam optimizer.") - parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") - parser.add_argument( - "--adam_weight_decay", type=float, default=1e-6, help="Weight decay magnitude for the Adam optimizer." - ) - parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") - parser.add_argument( - "--use_ema", - action="store_true", - help="Whether to use Exponential Moving Average for the final model weights.", - ) - parser.add_argument("--ema_inv_gamma", type=float, default=1.0, help="The inverse gamma value for the EMA decay.") - parser.add_argument("--ema_power", type=float, default=3 / 4, help="The power value for the EMA decay.") - parser.add_argument("--ema_max_decay", type=float, default=0.9999, help="The maximum decay magnitude for EMA.") - parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") - parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") - parser.add_argument( - "--hub_model_id", - type=str, - default=None, - help="The name of the repository to keep in sync with the local `output_dir`.", - ) - parser.add_argument( - "--hub_private_repo", action="store_true", help="Whether or not to create a private repository." - ) - parser.add_argument( - "--logger", - type=str, - default="visualdl", - choices=["visualdl", "tensorboard"], - help=( - "Whether to use [tensorboard](https://www.tensorflow.org/tensorboard) or [wandb](https://www.wandb.ai)" - " for experiment tracking and logging of model metrics and model checkpoints" - ), - ) - parser.add_argument( - "--logging_dir", - type=str, - default="logs", - help=( - "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" - " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." - ), - ) - parser.add_argument( - "--prediction_type", - type=str, - default="epsilon", - choices=["epsilon", "sample"], - help="Whether the model should predict the 'epsilon'/noise error or directly the reconstructed image 'x0'.", - ) - parser.add_argument("--ddpm_num_steps", type=int, default=1000) - parser.add_argument("--ddpm_num_inference_steps", type=int, default=1000) - parser.add_argument("--ddpm_beta_schedule", type=str, default="linear") - parser.add_argument( - "--checkpointing_steps", - type=int, - default=500, - help=( - "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming" - " training using `--resume_from_checkpoint`." - ), - ) - parser.add_argument( - "--checkpoints_total_limit", - type=int, - default=None, - help=( - "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." - " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" - " for more docs" - ), - ) - parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") - parser.add_argument( - "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." 
- ) - args = parser.parse_args() - - if args.dataset_name is None and args.train_data_dir is None: - raise ValueError("You must specify either a dataset name from the hub or a train data directory.") - - return args - - -def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): - if token is None: - token = HfFolder.get_token() - if organization is None: - username = whoami(token)["name"] - return f"{username}/{model_id}" - else: - return f"{organization}/{model_id}" - - -def main(): - args = parse_args() - rank = paddle.distributed.get_rank() - is_main_process = rank == 0 - num_processes = paddle.distributed.get_world_size() - if num_processes > 1: - paddle.distributed.init_parallel_env() - - # If passed along, set the training seed now. - if args.seed is not None: - set_seed(args.seed) - # `accelerate` 0.16.0 will have better support for customized saving - - # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format - def save_model_hook(models, weights, output_dir): - if args.use_ema: - ema_model.save_pretrained(os.path.join(output_dir, "unet_ema")) - - for i, model in enumerate(models): - model.save_pretrained(os.path.join(output_dir, "unet")) - - # make sure to pop weight so that corresponding model is not saved again - weights.pop() - - def load_model_hook(models, input_dir): - if args.use_ema: - load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DModel) - ema_model.load_state_dict(load_model.state_dict()) - del load_model - - for i in range(len(models)): - # pop models so that they are not loaded again - model = models.pop() - - # load ppdiffusers style into model - load_model = UNet2DModel.from_pretrained(input_dir, subfolder="unet") - model.register_to_config(**load_model.config) - - model.load_state_dict(load_model.state_dict()) - del load_model - - # accelerator.register_save_state_pre_hook(save_model_hook) - # accelerator.register_load_state_pre_hook(load_model_hook) - - # Make one log on every process with the configuration for debugging. 
- logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, - ) - - # Handle the repository creation - if is_main_process: - if args.push_to_hub: - if args.hub_model_id is None: - repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) - else: - repo_name = args.hub_model_id - create_repo(repo_name, exist_ok=True, token=args.hub_token) - # repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) - - with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: - if "step_*" not in gitignore: - gitignore.write("step_*\n") - if "epoch_*" not in gitignore: - gitignore.write("epoch_*\n") - elif args.output_dir is not None: - os.makedirs(args.output_dir, exist_ok=True) - - # Initialize the model - if args.model_config_name_or_path is None: - model = UNet2DModel( - sample_size=args.resolution, - in_channels=3, - out_channels=3, - layers_per_block=2, - block_out_channels=(128, 128, 256, 256, 512, 512), - down_block_types=( - "DownBlock2D", - "DownBlock2D", - "DownBlock2D", - "DownBlock2D", - "AttnDownBlock2D", - "DownBlock2D", - ), - up_block_types=( - "UpBlock2D", - "AttnUpBlock2D", - "UpBlock2D", - "UpBlock2D", - "UpBlock2D", - "UpBlock2D", - ), - ) - else: - config = UNet2DModel.load_config(args.model_config_name_or_path) - model = UNet2DModel.from_config(config) - - # Create EMA for the model. - if args.use_ema: - ema_model = EMAModel( - model.parameters(), - decay=args.ema_max_decay, - use_ema_warmup=True, - inv_gamma=args.ema_inv_gamma, - power=args.ema_power, - model_cls=UNet2DModel, - model_config=model.config, - ) - - if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): - try: - model.enable_xformers_memory_efficient_attention() - except Exception as e: - logger.warn( - "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}" - ) - # Initialize the scheduler - accepts_prediction_type = "prediction_type" in set(inspect.signature(DDPMScheduler.__init__).parameters.keys()) - if accepts_prediction_type: - noise_scheduler = DDPMScheduler( - num_train_timesteps=args.ddpm_num_steps, - beta_schedule=args.ddpm_beta_schedule, - prediction_type=args.prediction_type, - ) - else: - noise_scheduler = DDPMScheduler(num_train_timesteps=args.ddpm_num_steps, beta_schedule=args.ddpm_beta_schedule) - - # Get the datasets: you can either provide your own training and evaluation files (see below) - # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub). - - # In distributed training, the load_dataset function guarantees that only one local process can concurrently - # download the dataset. - if args.dataset_name is not None: - dataset = load_dataset( - args.dataset_name, - args.dataset_config_name, - cache_dir=args.cache_dir, - split="train", - ) - else: - dataset = load_dataset("imagefolder", data_dir=args.train_data_dir, cache_dir=args.cache_dir, split="train") - # See more about loading custom images at - - # Preprocessing the datasets and DataLoaders creation. 
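As an aside on the scheduler setup above: `prediction_type` is only forwarded when the installed `DDPMScheduler.__init__` actually declares it, which keeps the script compatible with older releases. A minimal, generic sketch of that feature-detection pattern (the helper name is hypothetical):

```python
import inspect


def filter_supported_kwargs(callable_obj, **kwargs):
    """Keep only the keyword arguments that `callable_obj` explicitly declares."""
    accepted = set(inspect.signature(callable_obj).parameters)
    return {k: v for k, v in kwargs.items() if k in accepted}


# Older scheduler releases that lack `prediction_type` simply never receive it:
# scheduler_kwargs = filter_supported_kwargs(
#     DDPMScheduler, num_train_timesteps=1000, beta_schedule="linear", prediction_type="epsilon"
# )
# noise_scheduler = DDPMScheduler(**scheduler_kwargs)
```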
- augmentations = transforms.Compose( - [ - transforms.Resize(args.resolution, interpolation="bilinear"), - transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution), - transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ] - ) - - def transform_images(examples): - images = [augmentations(image.convert("RGB")) for image in examples["image"]] - return {"input": images} - - # logger.info(f"Dataset size: {len(dataset)}") - - dataset.set_transform(transform_images) - train_dataloader = paddle.io.DataLoader( - dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers - ) - - if num_processes > 1: - model = paddle.DataParallel(model) - - # Initialize the learning rate scheduler - lr_scheduler = get_scheduler( - args.lr_scheduler, - learning_rate=args.learning_rate, - num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, - num_training_steps=(len(train_dataloader) * args.num_epochs), - ) - - # Initialize the optimizer - optimizer = paddle.optimizer.AdamW( - learning_rate=lr_scheduler, - parameters=model.parameters(), - beta1=args.adam_beta1, - beta2=args.adam_beta2, - weight_decay=args.adam_weight_decay, - epsilon=args.adam_epsilon, - grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None, - ) - - if is_main_process: - logger.info("----------- Configuration Arguments -----------") - for arg, value in sorted(vars(args).items()): - logger.info("%s: %s" % (arg, value)) - logger.info("------------------------------------------------") - writer = get_report_to(args) - - # Prepare everything with our `accelerator`. - total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps - num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - max_train_steps = args.num_epochs * num_update_steps_per_epoch - - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(dataset)}") - logger.info(f" Num Epochs = {args.num_epochs}") - logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") - logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {max_train_steps}") - - global_step = 0 - first_epoch = 0 - - # Train! - for epoch in range(first_epoch, args.num_epochs): - model.train() - progress_bar = tqdm(total=num_update_steps_per_epoch, disable=not is_main_process) - progress_bar.set_description(f"Epoch {epoch}") - for step, batch in enumerate(train_dataloader): - clean_images = batch["input"] - # Sample noise that we'll add to the images - noise = paddle.randn(clean_images.shape) - bsz = clean_images.shape[0] - # Sample a random timestep for each image - timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,)).cast("int64") - - # Add noise to the clean images according to the noise magnitude at each timestep - # (this is the forward diffusion process) - noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps) - - # Predict the noise residual - model_output = model(noisy_images, timesteps).sample - - if args.prediction_type == "epsilon": - loss = F.mse_loss(model_output, noise) # this could have different weights! 
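# Editor's aside: the `elif` branch below handles `--prediction_type sample`, where the model
# regresses the clean image directly. The per-pixel MSE is then weighted by
# alpha_bar_t / (1 - alpha_bar_t), i.e. the signal-to-noise ratio of timestep t, so heavily
# noised timesteps (low SNR) contribute less to the loss, matching the SNR weighting from the
# distillation paper referenced in the original comment.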
- elif args.prediction_type == "sample": - alpha_t = _extract_into_tensor( - noise_scheduler.alphas_cumprod, timesteps, (clean_images.shape[0], 1, 1, 1) - ) - snr_weights = alpha_t / (1 - alpha_t) - loss = snr_weights * F.mse_loss( - model_output, clean_images, reduction="none" - ) # use SNR weighting from distillation paper - loss = loss.mean() - else: - raise ValueError(f"Unsupported prediction type: {args.prediction_type}") - - loss.backward() - - optimizer.step() - lr_scheduler.step() - optimizer.clear_grad() - - # Checks if the accelerator has performed an optimization step behind the scenes - - if args.use_ema: - ema_model.step(model.parameters()) - progress_bar.update(1) - global_step += 1 - - if global_step % args.checkpointing_steps == 0: - if is_main_process: - save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - if args.use_ema: - unwrap_model(ema_model).save_pretrained(os.path.join(save_path, "unet_ema")) - unwrap_model(model).save_pretrained(os.path.join(save_path, "unet")) - - logger.info(f"Saved state to {save_path}") - - logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_lr(), "step": global_step} - if args.use_ema: - logs["ema_decay"] = ema_model.cur_decay_value - progress_bar.set_postfix(**logs) - - progress_bar.close() - - # Generate sample images for visual inspection - if is_main_process: - # writer.close() - if epoch % args.save_images_epochs == 0 or epoch == args.num_epochs - 1: - unet = unwrap_model(model) - if args.use_ema: - ema_model.store(unet.parameters()) - ema_model.copy_to(unet.parameters()) - pipeline = DDPMPipeline( - unet=unet, - scheduler=noise_scheduler, - ) - - generator = paddle.Generator().manual_seed(0) - # run pipeline in inference (sample random noise and denoise) - images = pipeline( - generator=generator, - batch_size=args.eval_batch_size, - num_inference_steps=args.ddpm_num_inference_steps, - output_type="numpy", - ).images - - if args.use_ema: - ema_model.restore(unet.parameters()) - # denormalize the images and save to tensorboard - images_processed = (images * 255).round().astype("uint8") - if args.report_to == "tensorboard": - writer.add_images("test", images_processed.transpose(0, 3, 1, 2), epoch, dataformats="NHWC") - else: - writer.add_image("test", images_processed.transpose(0, 3, 1, 2), epoch, dataformats="NHWC") - - if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1: - # save the model - # save the model - unet = unwrap_model(model) - - if args.use_ema: - ema_model.store(unet.parameters()) - ema_model.copy_to(unet.parameters()) - - pipeline = DDPMPipeline( - unet=unet, - scheduler=noise_scheduler, - ) - - pipeline.save_pretrained(args.output_dir) - - if args.use_ema: - ema_model.restore(unet.parameters()) - - if is_main_process: - writer.close() - - -if __name__ == "__main__": - main() diff --git a/ppdiffusers/ppdiffusers/__init__.py b/ppdiffusers/ppdiffusers/__init__.py deleted file mode 100644 index 5eba7e20dbc3..000000000000 --- a/ppdiffusers/ppdiffusers/__init__.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
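Stepping back to the training loop above: whenever `--use_ema` is set, evaluation and saving temporarily swap the EMA-averaged weights into the UNet and afterwards restore the raw training weights. A condensed sketch of that swap, assuming an `EMAModel` instance and a trained `unet` as in the script above (illustrative only, the helper below is not part of the original code):

```python
def with_ema_weights(ema_model, unet, fn):
    """Run `fn(unet)` with EMA weights swapped in, then restore the raw weights."""
    ema_model.store(unet.parameters())     # stash the current (raw) parameters
    ema_model.copy_to(unet.parameters())   # load the averaged parameters
    try:
        return fn(unet)
    finally:
        ema_model.restore(unet.parameters())  # put the raw parameters back


# e.g. exporting an EMA pipeline without disturbing training:
# with_ema_weights(ema_model, unet, lambda m: DDPMPipeline(unet=m, scheduler=noise_scheduler).save_pretrained(args.output_dir))
```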
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# flake8: noqa - - -from . import patches -from .configuration_utils import ConfigMixin -from .utils import ( - OptionalDependencyNotAvailable, - is_einops_available, - is_fastdeploy_available, - is_inflect_available, - is_k_diffusion_available, - is_k_diffusion_version, - is_librosa_available, - is_note_seq_available, - is_paddle_available, - is_paddle_version, - is_paddlenlp_available, - is_paddlenlp_version, - is_ppxformers_available, - is_safetensors_available, - is_scipy_available, - is_torch_available, - is_unidecode_available, - is_visualdl_available, - logging, -) -from .version import VERSION as __version__ - -try: - if not is_fastdeploy_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from .utils.dummy_fastdeploy_objects import * # noqa F403 -else: - from .pipelines import FastDeployRuntimeModel - -try: - if not is_paddle_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from .utils.dummy_paddle_objects import * # noqa F403 -else: - from .models import ( - AutoencoderKL, - ControlNetModel, - LitEma, - ModelMixin, - MultiAdapter, - PriorTransformer, - T2IAdapter, - T5FilmDecoder, - Transformer2DModel, - UNet1DModel, - UNet2DConditionModel, - UNet2DModel, - UNet3DConditionModel, - VQModel, - ) - from .optimization import ( - get_constant_schedule, - get_constant_schedule_with_warmup, - get_cosine_schedule_with_warmup, - get_cosine_with_hard_restarts_schedule_with_warmup, - get_linear_schedule_with_warmup, - get_polynomial_decay_schedule_with_warmup, - get_scheduler, - ) - from .pipelines import ( - AudioPipelineOutput, - DanceDiffusionPipeline, - DDIMPipeline, - DDPMPipeline, - DiffusionPipeline, - DiTPipeline, - ImagePipelineOutput, - KarrasVePipeline, - LDMPipeline, - LDMSuperResolutionPipeline, - PNDMPipeline, - RePaintPipeline, - ScoreSdeVePipeline, - TextPipelineOutput, - ) - from .schedulers import ( - DDIMInverseScheduler, - DDIMScheduler, - DDPMScheduler, - DEISMultistepScheduler, - DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, - DPMSolverUniDiffuserScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - HeunDiscreteScheduler, - IPNDMScheduler, - KarrasVeScheduler, - KDPM2AncestralDiscreteScheduler, - KDPM2DiscreteScheduler, - PNDMScheduler, - RePaintScheduler, - SchedulerMixin, - ScoreSdeVeScheduler, - UnCLIPScheduler, - UniPCMultistepScheduler, - VQDiffusionScheduler, - ) - from .schedulers.preconfig import ( - PreconfigEulerAncestralDiscreteScheduler, - PreconfigLMSDiscreteScheduler, - ) - from .training_utils import EMAModel - -try: - if not (is_paddle_available() and is_scipy_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from .utils.dummy_paddle_and_scipy_objects import * # noqa F403 -else: - from .schedulers import LMSDiscreteScheduler - - -try: - if not (is_paddle_available() and is_paddlenlp_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from .utils.dummy_paddle_and_paddlenlp_objects import * # noqa F403 -else: - from .pipelines 
import ( - AltDiffusionImg2ImgPipeline, - AltDiffusionPipeline, - AudioLDMPipeline, - CycleDiffusionPipeline, - IFImg2ImgPipeline, - IFImg2ImgSuperResolutionPipeline, - IFInpaintingPipeline, - IFInpaintingSuperResolutionPipeline, - IFPipeline, - IFSuperResolutionPipeline, - LDMTextToImagePipeline, - PaintByExamplePipeline, - SemanticStableDiffusionPipeline, - StableDiffusionAdapterPipeline, - StableDiffusionAttendAndExcitePipeline, - StableDiffusionControlNetPipeline, - StableDiffusionDepth2ImgPipeline, - StableDiffusionImageVariationPipeline, - StableDiffusionImg2ImgPipeline, - StableDiffusionInpaintPipeline, - StableDiffusionInpaintPipelineLegacy, - StableDiffusionInstructPix2PixPipeline, - StableDiffusionLatentUpscalePipeline, - StableDiffusionMegaPipeline, - StableDiffusionModelEditingPipeline, - StableDiffusionPanoramaPipeline, - StableDiffusionPipeline, - StableDiffusionPipelineAllinOne, - StableDiffusionPipelineSafe, - StableDiffusionPix2PixZeroPipeline, - StableDiffusionSAGPipeline, - StableDiffusionUpscalePipeline, - StableUnCLIPImg2ImgPipeline, - StableUnCLIPPipeline, - TextToVideoSDPipeline, - TextToVideoZeroPipeline, - UnCLIPImageVariationPipeline, - UnCLIPPipeline, - UniDiffuserPipeline, - VersatileDiffusionDualGuidedPipeline, - VersatileDiffusionImageVariationPipeline, - VersatileDiffusionPipeline, - VersatileDiffusionTextToImagePipeline, - VQDiffusionPipeline, - ) - from .pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel - from .pipelines.unidiffuser.caption_decoder import CaptionDecoder - -try: - if not (is_paddle_available() and is_paddlenlp_available() and is_k_diffusion_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from .utils.dummy_paddle_and_paddlenlp_and_k_diffusion_objects import * # noqa F403 -else: - from .pipelines import StableDiffusionKDiffusionPipeline - -try: - if not (is_paddle_available() and is_paddlenlp_available() and is_fastdeploy_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from .utils.dummy_paddle_and_paddlenlp_and_fastdeploy_objects import * # noqa F403 -else: - from .pipelines import ( - FastDeployCycleDiffusionPipeline, - FastDeployStableDiffusionControlNetPipeline, - FastDeployStableDiffusionImageVariationPipeline, - FastDeployStableDiffusionImg2ImgPipeline, - FastDeployStableDiffusionInpaintPipeline, - FastDeployStableDiffusionInpaintPipelineLegacy, - FastDeployStableDiffusionMegaPipeline, - FastDeployStableDiffusionPipeline, - FastDeployStableDiffusionUpscalePipeline, - ) - -try: - if not (is_paddle_available() and is_librosa_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from .utils.dummy_paddle_and_librosa_objects import * # noqa F403 -else: - from .pipelines import AudioDiffusionPipeline, Mel - - -try: - if not (is_paddle_available() and is_paddlenlp_available() and is_note_seq_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from .utils.dummy_paddle_and_paddlenlp_and_note_seq_objects import * # noqa F403 -else: - from .pipelines import SpectrogramDiffusionPipeline - - -try: - if not (is_paddle_available() and is_paddlenlp_available() and is_einops_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from .utils.dummy_paddle_and_paddlenlp_and_einops_objects import * # noqa F403 -else: - from .pipelines import UniDiffuserPipeline - -try: - if not (is_paddle_available() and 
is_einops_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from .utils.dummy_paddle_and_einops_objects import * # noqa F403 -else: - from .models import UViTModel - -try: - if not (is_note_seq_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from .utils.dummy_note_seq_objects import * # noqa F403 -else: - from .pipelines import MidiProcessor diff --git a/ppdiffusers/ppdiffusers/commands/__init__.py b/ppdiffusers/ppdiffusers/commands/__init__.py deleted file mode 100644 index c7d87f7ecd04..000000000000 --- a/ppdiffusers/ppdiffusers/commands/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from abc import ABC, abstractmethod -from argparse import ArgumentParser - - -class BasePPDiffusersCLICommand(ABC): - @staticmethod - @abstractmethod - def register_subcommand(parser: ArgumentParser): - raise NotImplementedError() - - @abstractmethod - def run(self): - raise NotImplementedError() diff --git a/ppdiffusers/ppdiffusers/commands/env.py b/ppdiffusers/ppdiffusers/commands/env.py deleted file mode 100644 index 0ad95fd64734..000000000000 --- a/ppdiffusers/ppdiffusers/commands/env.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import platform -from argparse import ArgumentParser - -from ..utils import is_paddle_available, is_paddlenlp_available -from ..version import VERSION as version -from . 
import BasePPDiffusersCLICommand - - -def info_command_factory(_): - return EnvironmentCommand() - - -class EnvironmentCommand(BasePPDiffusersCLICommand): - @staticmethod - def register_subcommand(parser: ArgumentParser): - download_parser = parser.add_parser("env") - download_parser.set_defaults(func=info_command_factory) - - def run(self): - - pd_version = "not installed" - pd_cuda_available = "NA" - if is_paddle_available(): - import paddle - - pd_version = paddle.__version__ - pd_cuda_available = paddle.device.is_compiled_with_cuda() - - paddlenlp_version = "not installed" - if is_paddlenlp_available: - import paddlenlp - - paddlenlp_version = paddlenlp.__version__ - - info = { - "`ppdiffusers` version": version, - "Platform": platform.platform(), - "Python version": platform.python_version(), - "Paddle version (GPU?)": f"{pd_version} ({pd_cuda_available})", - "PaddleNLP version": paddlenlp_version, - "Using GPU in script?": "", - "Using distributed or parallel set-up in script?": "", - } - - print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n") - print(self.format_dict(info)) - - return info - - @staticmethod - def format_dict(d): - return "\n".join([f"- {prop}: {val}" for prop, val in d.items()]) + "\n" diff --git a/ppdiffusers/ppdiffusers/commands/ppdiffusers_cli.py b/ppdiffusers/ppdiffusers/commands/ppdiffusers_cli.py deleted file mode 100644 index 7575e5902a50..000000000000 --- a/ppdiffusers/ppdiffusers/commands/ppdiffusers_cli.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from argparse import ArgumentParser - -from .env import EnvironmentCommand - - -def main(): - parser = ArgumentParser("PPDiffusers CLI tool", usage="ppdiffusers-cli []") - commands_parser = parser.add_subparsers(help="ppdiffusers-cli command helpers") - - # Register commands - EnvironmentCommand.register_subcommand(commands_parser) - - # Let's go - args = parser.parse_args() - - if not hasattr(args, "func"): - parser.print_help() - exit(1) - - # Run - service = args.func(args) - service.run() - - -if __name__ == "__main__": - main() diff --git a/ppdiffusers/ppdiffusers/configuration_utils.py b/ppdiffusers/ppdiffusers/configuration_utils.py deleted file mode 100644 index 0756734198f6..000000000000 --- a/ppdiffusers/ppdiffusers/configuration_utils.py +++ /dev/null @@ -1,666 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 The HuggingFace Inc. team. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
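One note on the CLI plumbing removed above: `ppdiffusers-cli` discovers functionality through `BasePPDiffusersCLICommand` subclasses that register an argparse sub-parser and expose a `run()` method, exactly as `EnvironmentCommand` does. A hedged sketch of what an additional subcommand would look like (the command itself is hypothetical):

```python
from argparse import ArgumentParser

from ppdiffusers.commands import BasePPDiffusersCLICommand


class VersionCommand(BasePPDiffusersCLICommand):
    """Hypothetical `ppdiffusers-cli version` subcommand, shown only to illustrate the pattern."""

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        sub = parser.add_parser("version")
        # the factory receives the parsed args and returns the command instance
        sub.set_defaults(func=lambda args: VersionCommand())

    def run(self):
        from ppdiffusers.version import VERSION
        print(VERSION)


# It would then be wired up next to EnvironmentCommand in ppdiffusers_cli.py:
# VersionCommand.register_subcommand(commands_parser)
```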
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" ConfigMixin base class and utilities.""" -import functools -import importlib -import inspect -import json -import os -import re -from collections import OrderedDict -from pathlib import PosixPath -from typing import Any, Dict, Tuple, Union - -import numpy as np -import paddle - -from .utils import ( - DIFFUSERS_CACHE, - PPDIFFUSERS_CACHE, - DummyObject, - bos_hf_download, - deprecate, - extract_commit_hash, - http_user_agent, - logging, -) -from .utils.constants import FROM_HF_HUB -from .version import VERSION as __version__ - -logger = logging.get_logger(__name__) - -_re_configuration_file = re.compile(r"config\.(.*)\.json") - - -class FrozenDict(OrderedDict): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - for key, value in self.items(): - setattr(self, key, value) - - self.__frozen = True - - def __delitem__(self, *args, **kwargs): - raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.") - - def setdefault(self, *args, **kwargs): - raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.") - - def pop(self, *args, **kwargs): - raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") - - def update(self, *args, **kwargs): - raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.") - - def __setattr__(self, name, value): - if hasattr(self, "__frozen") and self.__frozen: - raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.") - super().__setattr__(name, value) - - def __setitem__(self, name, value): - if hasattr(self, "__frozen") and self.__frozen: - raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.") - super().__setitem__(name, value) - - -class ConfigMixin: - r""" - Base class for all configuration classes. Stores all configuration parameters under `self.config` Also handles all - methods for loading/downloading/saving classes inheriting from [`ConfigMixin`] with - - [`~ConfigMixin.from_config`] - - [`~ConfigMixin.save_config`] - - Class attributes: - - **config_name** (`str`) -- A filename under which the config should stored when calling - [`~ConfigMixin.save_config`] (should be overridden by parent class). - - **ignore_for_config** (`List[str]`) -- A list of attributes that should not be saved in the config (should be - overridden by subclass). - - **has_compatibles** (`bool`) -- Whether the class has compatible classes (should be overridden by subclass). - - **_deprecated_kwargs** (`List[str]`) -- Keyword arguments that are deprecated. Note that the init function - should only have a `kwargs` argument if at least one argument is deprecated (should be overridden by - subclass). 
- """ - config_name = None - ignore_for_config = [] - has_compatibles = False - - _deprecated_kwargs = [] - - def register_to_config(self, **kwargs): - if self.config_name is None: - raise NotImplementedError(f"Make sure that {self.__class__} has defined a class name `config_name`") - # Special case for `kwargs` used in deprecation warning added to schedulers - # TODO: remove this when we remove the deprecation warning, and the `kwargs` argument, - # or solve in a more general way. - kwargs.pop("kwargs", None) - - if not hasattr(self, "_internal_dict"): - internal_dict = kwargs - else: - previous_dict = dict(self._internal_dict) - internal_dict = {**self._internal_dict, **kwargs} - logger.debug(f"Updating config from {previous_dict} to {internal_dict}") - - self._internal_dict = FrozenDict(internal_dict) - - def __getattr__(self, name: str) -> Any: - """The only reason we overwrite `getattr` here is to gracefully deprecate accessing - config attributes directly. See https://github.com/huggingface/diffusers/pull/3129 - Tihs funtion is mostly copied from PyTorch's __getattr__ overwrite: - https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module - """ - - is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name) - is_attribute = name in self.__dict__ - - if is_in_config and not is_attribute: - deprecation_message = f"Accessing config attribute `{name}` directly via '{type(self).__name__}' object attribute is deprecated. Please access '{name}' over '{type(self).__name__}'s config object instead, e.g. 'scheduler.config.{name}'." - deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False) - return self._internal_dict[name] - - raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") - - def save_config( - self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, to_diffusers=False, **kwargs - ): - """ - Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the - [`~ConfigMixin.from_config`] class method. - - Args: - save_directory (`str` or `os.PathLike`): - Directory where the configuration JSON file will be saved (will be created if it does not exist). - """ - if os.path.isfile(save_directory): - raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") - - os.makedirs(save_directory, exist_ok=True) - - # If we save using the predefined names, we can load using `from_config` - output_config_file = os.path.join(save_directory, self.config_name) - - self.to_json_file(output_config_file, to_diffusers=to_diffusers) - logger.info(f"Configuration saved in {output_config_file}") - - @classmethod - def from_config(cls, config: Union[FrozenDict, Dict[str, Any]] = None, return_unused_kwargs=False, **kwargs): - r""" - Instantiate a Python class from a config dictionary - - Parameters: - config (`Dict[str, Any]`): - A config dictionary from which the Python class will be instantiated. Make sure to only load - configuration files of compatible classes. - return_unused_kwargs (`bool`, *optional*, defaults to `False`): - Whether kwargs that are not consumed by the Python class should be returned or not. - - kwargs (remaining dictionary of keyword arguments, *optional*): - Can be used to update the configuration object (after it being loaded) and initiate the Python class. 
- `**kwargs` will be directly passed to the underlying scheduler/model's `__init__` method and eventually - overwrite same named arguments of `config`. - - Examples: - - ```python - >>> from ppdiffusers import DDPMScheduler, DDIMScheduler, PNDMScheduler - - >>> # Download scheduler from huggingface.co and cache. - >>> scheduler = DDPMScheduler.from_pretrained("google/ddpm-cifar10-32") - - >>> # Instantiate DDIM scheduler class with same config as DDPM - >>> scheduler = DDIMScheduler.from_config(scheduler.config) - - >>> # Instantiate PNDM scheduler class with same config as DDPM - >>> scheduler = PNDMScheduler.from_config(scheduler.config) - ``` - """ - # <===== TO BE REMOVED WITH DEPRECATION - # TODO(Patrick) - make sure to remove the following lines when config=="model_path" is deprecated - if "pretrained_model_name_or_path" in kwargs: - config = kwargs.pop("pretrained_model_name_or_path") - - if config is None: - raise ValueError("Please make sure to provide a config as the first positional argument.") - # ======> - - if not isinstance(config, dict): - deprecation_message = "It is deprecated to pass a pretrained model name or path to `from_config`." - if "Scheduler" in cls.__name__: - deprecation_message += ( - f"If you were trying to load a scheduler, please use {cls}.from_pretrained(...) instead." - " Otherwise, please make sure to pass a configuration dictionary instead. This functionality will" - " be removed in v1.0.0." - ) - elif "Model" in cls.__name__: - deprecation_message += ( - f"If you were trying to load a model, please use {cls}.load_config(...) followed by" - f" {cls}.from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary" - " instead. This functionality will be removed in v1.0.0." - ) - deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False) - config, kwargs = cls.load_config(pretrained_model_name_or_path=config, return_unused_kwargs=True, **kwargs) - - init_dict, unused_kwargs, hidden_dict = cls.extract_init_dict(config, **kwargs) - - # Allow dtype to be specified on initialization - if "dtype" in unused_kwargs: - init_dict["dtype"] = unused_kwargs.pop("dtype") - - # add possible deprecated kwargs - for deprecated_kwarg in cls._deprecated_kwargs: - if deprecated_kwarg in unused_kwargs: - init_dict[deprecated_kwarg] = unused_kwargs.pop(deprecated_kwarg) - - # Return model and optionally state and/or unused_kwargs - model = cls(**init_dict) - - # make sure to also save config parameters that might be used for compatible classes - model.register_to_config(**hidden_dict) - - # add hidden kwargs of compatible classes to unused_kwargs - unused_kwargs = {**unused_kwargs, **hidden_dict} - - if return_unused_kwargs: - return (model, unused_kwargs) - else: - return model - - @classmethod - def get_config_dict(cls, *args, **kwargs): - deprecation_message = ( - f" The function get_config_dict is deprecated. Please use {cls}.load_config instead. 
This function will be" - " removed in version v1.0.0" - ) - deprecate("get_config_dict", "1.0.0", deprecation_message, standard_warn=False) - return cls.load_config(*args, **kwargs) - - @classmethod - def load_config( - cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - return_unused_kwargs=False, - return_commit_hash=False, - **kwargs, - ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - r""" - Instantiate a Python class from a config dictionary - - Parameters: - pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): - Can be either: - - - A string, the *model id* of a model repo on huggingface.co. Valid model ids should have an - organization name, like `google/ddpm-celebahq-256`. - - A path to a *directory* containing model weights saved using [`~ConfigMixin.save_config`], e.g., - `./my_model_directory/`. - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - output_loading_info(`bool`, *optional*, defaults to `False`): - Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. - local_files_only(`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). - use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. - subfolder (`str`, *optional*, defaults to `""`): - In case the relevant files are located inside a subfolder of the model repo (either remote in - huggingface.co or downloaded locally), you can specify the folder name here. - return_unused_kwargs (`bool`, *optional*, defaults to `False): - Whether unused keyword arguments of the config shall be returned. - return_commit_hash (`bool`, *optional*, defaults to `False): - Whether the commit_hash of the loaded configuration shall be returned. - from_hf_hub (bool, *optional*): - Whether to load from Hugging Face Hub. Defaults to False - - - It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated - models](https://huggingface.co/docs/hub/models-gated#gated-models). - - - - - - Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to - use this method in a firewalled environment. 
- - - """ - from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = ( - kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) - ) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - use_auth_token = kwargs.pop("use_auth_token", None) - local_files_only = kwargs.pop("local_files_only", False) - revision = kwargs.pop("revision", None) - _ = kwargs.pop("mirror", None) - subfolder = kwargs.pop("subfolder", None) - user_agent = kwargs.pop("user_agent", {}) - user_agent = {**user_agent, "file_type": "config"} - user_agent = http_user_agent(user_agent) - # new add return_config_file - return_config_file = kwargs.pop("return_config_file", False) - - pretrained_model_name_or_path = str(pretrained_model_name_or_path) - - if cls.config_name is None: - raise ValueError( - "`self.config_name` is not defined. Note that one should not load a config from " - "`ConfigMixin`. Please make sure to define `config_name` in a class inheriting from `ConfigMixin`" - ) - - if os.path.isfile(pretrained_model_name_or_path): - config_file = pretrained_model_name_or_path - elif os.path.isdir(pretrained_model_name_or_path): - if os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)): - # Load from a PyTorch checkpoint - config_file = os.path.join(pretrained_model_name_or_path, cls.config_name) - elif subfolder is not None and os.path.isfile( - os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name) - ): - config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name) - else: - raise EnvironmentError( - f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}." - ) - else: - config_file = bos_hf_download( - pretrained_model_name_or_path, - filename=cls.config_name, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - user_agent=user_agent, - subfolder=subfolder, - revision=revision, - from_hf_hub=from_hf_hub, - ) - - try: - # Load config dict - config_dict = cls._dict_from_json_file(config_file) - commit_hash = extract_commit_hash(config_file) - - except (json.JSONDecodeError, UnicodeDecodeError): - raise EnvironmentError(f"It looks like the config file at '{config_file}' is not a valid JSON file.") - - if not (return_unused_kwargs or return_commit_hash or return_config_file): - return config_dict - - outputs = (config_dict,) - if return_unused_kwargs: - outputs += (kwargs,) - - if return_commit_hash: - outputs += (commit_hash,) - - if return_config_file: - outputs += (config_file,) - - return outputs - - @staticmethod - def _get_init_keys(cls): - return set(dict(inspect.signature(cls.__init__).parameters).keys()) - - @classmethod - def extract_init_dict(cls, config_dict, **kwargs): - # 0. Copy origin config dict - original_dict = dict(config_dict.items()) - - # 1. Retrieve expected config attributes from __init__ signature - expected_keys = cls._get_init_keys(cls) - expected_keys.remove("self") - # remove general kwargs if present in dict - if "kwargs" in expected_keys: - expected_keys.remove("kwargs") - - # 2. 
Remove attributes that cannot be expected from expected config attributes - # remove keys to be ignored - if len(cls.ignore_for_config) > 0: - expected_keys = expected_keys - set(cls.ignore_for_config) - - # load the ppdiffusers library to import compatible and original schedulers - ppdiffusers_library = importlib.import_module(__name__.split(".")[0]) - - if cls.has_compatibles: - compatible_classes = [c for c in cls._get_compatibles() if not isinstance(c, DummyObject)] - else: - compatible_classes = [] - - expected_keys_comp_cls = set() - for c in compatible_classes: - expected_keys_c = cls._get_init_keys(c) - expected_keys_comp_cls = expected_keys_comp_cls.union(expected_keys_c) - expected_keys_comp_cls = expected_keys_comp_cls - cls._get_init_keys(cls) - config_dict = {k: v for k, v in config_dict.items() if k not in expected_keys_comp_cls} - - # remove attributes from orig class that cannot be expected - orig_cls_name = config_dict.pop("_class_name", cls.__name__) - if orig_cls_name != cls.__name__ and hasattr(ppdiffusers_library, orig_cls_name): - orig_cls = getattr(ppdiffusers_library, orig_cls_name) - unexpected_keys_from_orig = cls._get_init_keys(orig_cls) - expected_keys - config_dict = {k: v for k, v in config_dict.items() if k not in unexpected_keys_from_orig} - - # remove private attributes - config_dict = {k: v for k, v in config_dict.items() if not k.startswith("_")} - - # 3. Create keyword arguments that will be passed to __init__ from expected keyword arguments - init_dict = {} - for key in expected_keys: - # if config param is passed to kwarg and is present in config dict - # it should overwrite existing config dict key - if key in kwargs and key in config_dict: - config_dict[key] = kwargs.pop(key) - - if key in kwargs: - # overwrite key - init_dict[key] = kwargs.pop(key) - elif key in config_dict: - # use value from config dict - init_dict[key] = config_dict.pop(key) - - # 4. Give nice warning if unexpected values have been passed - if len(config_dict) > 0: - logger.warning( - f"The config attributes {config_dict} were passed to {cls.__name__}, " - "but are not expected and will be ignored. Please verify your " - f"{cls.config_name} configuration file." - ) - - # 5. Give nice info if config attributes are initialized to default because they have not been passed - passed_keys = set(init_dict.keys()) - if len(expected_keys - passed_keys) > 0: - logger.info( - f"{expected_keys - passed_keys} was not found in config. Values will be initialized to default values." - ) - - # 6. Define unused keyword arguments - unused_kwargs = {**config_dict, **kwargs} - - # 7.
Define "hidden" config parameters that were saved for compatible classes - hidden_config_dict = {k: v for k, v in original_dict.items() if k not in init_dict} - - return init_dict, unused_kwargs, hidden_config_dict - - @classmethod - def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]): - with open(json_file, "r", encoding="utf-8") as reader: - text = reader.read() - data = json.loads(text) - if "_diffusers_version" in data and "_ppdiffusers_version" not in data: - data["_ppdiffusers_version"] = data.pop("_diffusers_version", __version__) - if "_diffusers_version" not in data and "_ppdiffusers_version" not in data: - data["_ppdiffusers_version"] = __version__ - - # remove Onnx and Flax prefix - _class_name = data.get("_class_name", None) - if _class_name is not None: - if _class_name.startswith("Flax"): - data["_class_name"] = _class_name[4:] - elif _class_name.startswith("Onnx"): - data["_class_name"] = "FastDeploy" + _class_name[4:] - - return data - - def __repr__(self): - return f"{self.__class__.__name__} {self.to_json_string()}" - - @property - def config(self) -> Dict[str, Any]: - """ - Returns the config of the class as a frozen dictionary - - Returns: - `Dict[str, Any]`: Config of the class. - """ - return self._internal_dict - - def to_json_string(self, to_diffusers=False) -> str: - """ - Serializes this instance to a JSON string. - - Returns: - `str`: String containing all the attributes that make up this configuration instance in JSON format. - """ - config_dict = self._internal_dict if hasattr(self, "_internal_dict") else {} - config_dict["_class_name"] = self.__class__.__name__ - - # json - if to_diffusers: - config_dict["_diffusers_version"] = __version__ - else: - config_dict["_ppdiffusers_version"] = __version__ - - def to_json_saveable(value): - if isinstance(value, np.ndarray): - value = value.tolist() - elif isinstance(value, PosixPath): - value = str(value) - return value - - config_dict = {k: to_json_saveable(v) for k, v in config_dict.items()} - if to_diffusers: - config_dict.pop("_ppdiffusers_version", None) - else: - config_dict.pop("_diffusers_version", None) - # Don't save "_ignore_files" - config_dict.pop("_ignore_files", None) - json_string = json.dumps(config_dict, indent=2, sort_keys=True) + "\n" - if to_diffusers: - json_string = json_string.replace('"ppdiffusers"', '"diffusers"').replace( - '"paddlenlp.transformers"', '"transformers"' - ) - return json_string - - def to_json_file(self, json_file_path: Union[str, os.PathLike], to_diffusers=False): - """ - Save this instance to a JSON file. - - Args: - json_file_path (`str` or `os.PathLike`): - Path to the JSON file in which this configuration instance's parameters will be saved. - """ - with open(json_file_path, "w", encoding="utf-8") as writer: - writer.write(self.to_json_string(to_diffusers=to_diffusers)) - - -def register_to_config(init): - r""" - Decorator to apply on the init of classes inheriting from [`ConfigMixin`] so that all the arguments are - automatically sent to `self.register_for_config`. To ignore a specific argument accepted by the init but that - shouldn't be registered in the config, use the `ignore_for_config` class variable - - Warning: Once decorated, all private arguments (beginning with an underscore) are trashed and not sent to the init! - """ - - @functools.wraps(init) - def inner_init(self, *args, **kwargs): - # Ignore private kwargs in the init. 
- init_kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")} - config_init_kwargs = {k: v for k, v in kwargs.items() if k.startswith("_")} - if not isinstance(self, ConfigMixin): - raise RuntimeError( - f"`@register_for_config` was applied to {self.__class__.__name__} init method, but this class does " - "not inherit from `ConfigMixin`." - ) - - ignore = getattr(self, "ignore_for_config", []) - # Get positional arguments aligned with kwargs - new_kwargs = {} - signature = inspect.signature(init) - parameters = { - name: p.default for i, (name, p) in enumerate(signature.parameters.items()) if i > 0 and name not in ignore - } - for arg, name in zip(args, parameters.keys()): - new_kwargs[name] = arg - - # Then add all kwargs - new_kwargs.update( - { - k: init_kwargs.get(k, default) - for k, default in parameters.items() - if k not in ignore and k not in new_kwargs - } - ) - new_kwargs = {**config_init_kwargs, **new_kwargs} - getattr(self, "register_to_config")(**new_kwargs) - init(self, *args, **init_kwargs) - - return inner_init - - -def finfo(dtype: paddle.dtype = None): - if dtype is None: - dtype = paddle.get_default_dtype() - - if dtype == paddle.bfloat16: - # NumPy does not support `np.finfo(np.uint16)`, so try to construct a finfo object to fetch the min value - class BFloatFInfo: - min = -3.3895313892515355e38 - - return BFloatFInfo - if dtype == paddle.float32: - return np.finfo(np.float32) - if dtype == paddle.float16: - return np.finfo(np.float16) - if dtype == paddle.float64: - return np.finfo(np.float64) - - -class ModuleUtilsMixin: - """ - A few utilities for `paddle.nn.Layer`, to be used as a mixin. - """ - - def get_extended_attention_mask( - self, attention_mask: paddle.Tensor, input_shape: Tuple[int], dtype: paddle.float32 = None - ) -> paddle.Tensor: - """ - Makes broadcastable attention and causal masks so that future and masked tokens are ignored. - Arguments: - attention_mask (`paddle.Tensor`): - Mask with ones indicating tokens to attend to, zeros for tokens to ignore. - input_shape (`Tuple[int]`): - The shape of the input to the model. - Returns: - `paddle.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`. - """ - if dtype is None: - dtype = self.dtype - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. - if attention_mask.dim() == 3: - extended_attention_mask = attention_mask[:, None, :, :] - elif attention_mask.dim() == 2: - # Provided a padding mask of dimensions [batch_size, seq_length] - # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] - extended_attention_mask = attention_mask[:, None, None, :] - else: - raise ValueError( - "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( - input_shape, attention_mask.shape - ) - ) - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -10000.0 for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely.
- extended_attention_mask = (1.0 - extended_attention_mask) * finfo(dtype).min - return extended_attention_mask diff --git a/ppdiffusers/ppdiffusers/experimental/README.md b/ppdiffusers/ppdiffusers/experimental/README.md deleted file mode 100644 index 847e23ba7c7a..000000000000 --- a/ppdiffusers/ppdiffusers/experimental/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# 🧨 PPDiffusers Experimental - -To give the **PPDiffusers library** more application scenarios, we have added some **experimental code** here. - -Currently the following scenarios are supported: -* Reinforcement learning via an implementation of the [PPDiffuser](https://arxiv.org/abs/2205.09991) model. diff --git a/ppdiffusers/ppdiffusers/experimental/__init__.py b/ppdiffusers/ppdiffusers/experimental/__init__.py deleted file mode 100644 index 9b4dc4a337e3..000000000000 --- a/ppdiffusers/ppdiffusers/experimental/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .rl import ValueGuidedRLPipeline diff --git a/ppdiffusers/ppdiffusers/experimental/rl/__init__.py b/ppdiffusers/ppdiffusers/experimental/rl/__init__.py deleted file mode 100644 index eb0af229daa9..000000000000 --- a/ppdiffusers/ppdiffusers/experimental/rl/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .value_guided_sampling import ValueGuidedRLPipeline diff --git a/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py b/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py deleted file mode 100644 index 730f5b91dba6..000000000000 --- a/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
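The `ValueGuidedRLPipeline` removed below plans by denoising state-action trajectories with a UNet while nudging each denoising step along the gradient of a learned value function. A minimal usage sketch, not taken from this patch: the checkpoint id, the d4rl environment, and the assumption that `from_pretrained` forwards `env` to the pipeline (as in the upstream diffusers example) are all illustrative.

```py
# Illustrative sketch only; the names below are assumptions, not part of this patch.
import d4rl  # noqa: F401  # registers the offline-RL gym environments
import gym

from ppdiffusers.experimental import ValueGuidedRLPipeline

env = gym.make("hopper-medium-v2")  # the class docstring notes only Hopper has pretrained models

# hypothetical checkpoint id mirroring the upstream diffusers example
pipeline = ValueGuidedRLPipeline.from_pretrained(
    "bglick13/hopper-medium-v2-value-function-hor32", env=env
)

obs = env.reset()
# __call__ returns a single de-normalized action for the current observation
action = pipeline(obs, planning_horizon=32, n_guide_steps=2, scale=0.1)
obs, reward, done, info = env.step(action)
```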
- -import numpy as np -import paddle - -from ...models.unet_1d import UNet1DModel -from ...pipelines import DiffusionPipeline -from ...utils import randn_tensor -from ...utils.dummy_paddle_objects import DDPMScheduler - - -class ValueGuidedRLPipeline(DiffusionPipeline): - r""" - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - Pipeline for sampling actions from a diffusion model trained to predict sequences of states. - - Original implementation inspired by this repository: https://github.com/jannerm/diffuser. - - Parameters: - value_function ([`UNet1DModel`]): A specialized UNet for fine-tuning trajectories base on reward. - unet ([`UNet1DModel`]): U-Net architecture to denoise the encoded trajectories. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded trajectories. Default for this - application is [`DDPMScheduler`]. - env: An environment following the OpenAI gym API to act in. For now only Hopper has pretrained models. - """ - - def __init__( - self, - value_function: UNet1DModel, - unet: UNet1DModel, - scheduler: DDPMScheduler, - env, - ): - super().__init__() - self.value_function = value_function - self.unet = unet - self.scheduler = scheduler - self.env = env - self.data = env.get_dataset() - self.means = {} - for key in self.data.keys(): - try: - self.means[key] = self.data[key].mean() - except Exception: - pass - self.stds = {} - for key in self.data.keys(): - try: - self.stds[key] = self.data[key].std() - except Exception: - pass - self.state_dim = env.observation_space.shape[0] - self.action_dim = env.action_space.shape[0] - - def normalize(self, x_in, key): - return (x_in - self.means[key]) / self.stds[key] - - def de_normalize(self, x_in, key): - return x_in * self.stds[key] + self.means[key] - - def to_paddle(self, x_in): - if type(x_in) is dict: - return {k: self.to_paddle(v) for k, v in x_in.items()} - elif paddle.is_tensor(x_in): - return x_in - return paddle.to_tensor(x_in) - - def reset_x0(self, x_in, cond, act_dim): - for key, val in cond.items(): - x_in[:, key, act_dim:] = val.clone() - return x_in - - def run_diffusion(self, x, conditions, n_guide_steps, scale): - batch_size = x.shape[0] - y = None - for i in self.progress_bar(self.scheduler.timesteps): - # create batch of timesteps to pass into model - timesteps = paddle.full((batch_size,), i, dtype=paddle.int64) - for _ in range(n_guide_steps): - with paddle.set_grad_enabled(True): - x.stop_gradient = False - - # permute to match dimension for pre-trained models - y = self.value_function(x.transpose([0, 2, 1]), timesteps).sample - grad = paddle.autograd.grad([y.sum()], [x])[0] - - posterior_variance = self.scheduler._get_variance(i) - model_std = paddle.exp(0.5 * posterior_variance) - grad = model_std * grad - - grad[timesteps < 2] = 0 - x = x.detach() - x = x + scale * grad - x = self.reset_x0(x, conditions, self.action_dim) - - prev_x = self.unet(x.transpose([0, 2, 1]), timesteps).sample.transpose([0, 2, 1]) - - # TODO: verify deprecation of this kwarg - x = self.scheduler.step(prev_x, i, x, predict_epsilon=False)["prev_sample"] - - # apply conditions to the trajectory (set the initial state) - x = self.reset_x0(x, conditions, self.action_dim) - x = self.to_paddle(x) - return x, y - - def __call__(self, obs, batch_size=64, planning_horizon=32, n_guide_steps=2, scale=0.1): - # 
normalize the observations and create batch dimension - obs = self.normalize(obs, "observations") - obs = obs[None].repeat(batch_size, axis=0) - - conditions = {0: self.to_paddle(obs)} - shape = (batch_size, planning_horizon, self.state_dim + self.action_dim) - - # generate initial noise and apply our conditions (to make the trajectories start at current state) - x1 = randn_tensor(shape, dtype=self.unet.dtype) - x = self.reset_x0(x1, conditions, self.action_dim) - x = self.to_paddle(x) - - # run the diffusion process - x, y = self.run_diffusion(x, conditions, n_guide_steps, scale) - - # sort output trajectories by value - sorted_idx = paddle.argsort(y, 0, descending=True).squeeze() - sorted_values = x[sorted_idx] - actions = sorted_values[:, :, : self.action_dim] - actions = actions.detach().cpu().numpy() - denorm_actions = self.de_normalize(actions, key="actions") - - # select the action with the highest value - if y is not None: - selected_index = 0 - else: - # if we didn't run value guiding, select a random action - selected_index = np.random.randint(0, batch_size) - - denorm_actions = denorm_actions[selected_index, 0] - return denorm_actions diff --git a/ppdiffusers/ppdiffusers/image_processor.py b/ppdiffusers/ppdiffusers/image_processor.py deleted file mode 100644 index 3d1fe9e43aab..000000000000 --- a/ppdiffusers/ppdiffusers/image_processor.py +++ /dev/null @@ -1,255 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings -from typing import List, Optional, Union - -import numpy as np -import paddle -import PIL -from PIL import Image - -from .configuration_utils import ConfigMixin, register_to_config -from .utils import CONFIG_NAME, PIL_INTERPOLATION, deprecate - - -class VaeImageProcessor(ConfigMixin): - """ - Image Processor for VAE - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. Can accept - `height` and `width` arguments from `preprocess` method - vae_scale_factor (`int`, *optional*, defaults to `8`): - VAE scale factor. If `do_resize` is True, the image will be automatically resized to multiples of this - factor. - resample (`str`, *optional*, defaults to `lanczos`): - Resampling filter to use when resizing the image. - do_normalize (`bool`, *optional*, defaults to `True`): - Whether to normalize the image to [-1,1] - do_convert_rgb (`bool`, *optional*, defaults to be `False`): - Whether to convert the images to RGB format. - """ - - config_name = CONFIG_NAME - - @register_to_config - def __init__( - self, - do_resize: bool = True, - vae_scale_factor: int = 8, - resample: str = "lanczos", - do_normalize: bool = True, - do_convert_rgb: bool = False, - ): - super().__init__() - - @staticmethod - def numpy_to_pil(images: np.ndarray) -> PIL.Image.Image: - """ - Convert a numpy image or a batch of images to a PIL image. 
- """ - if images.ndim == 3: - images = images[None, ...] - images = (images * 255).round().astype("uint8") - if images.shape[-1] == 1: - # special case for grayscale (single channel) images - pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] - else: - pil_images = [Image.fromarray(image) for image in images] - - return pil_images - - @staticmethod - def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray: - """ - Convert a PIL image or a list of PIL images to numpy arrays. - """ - if not isinstance(images, list): - images = [images] - images = [np.array(image).astype(np.float32) / 255.0 for image in images] - images = np.stack(images, axis=0) - - return images - - @staticmethod - def numpy_to_pd(images: np.ndarray) -> paddle.Tensor: - """ - Convert a numpy image to a paddle tensor - """ - if images.ndim == 3: - images = images[..., None] - - images = paddle.to_tensor(images.transpose([0, 3, 1, 2])) - return images - - @staticmethod - def pd_to_numpy(images: paddle.Tensor) -> np.ndarray: - """ - Convert a paddle tensor to a numpy image - """ - images = images.cast("float32").transpose([0, 2, 3, 1]).numpy() - return images - - @staticmethod - def normalize(images): - """ - Normalize an image array to [-1,1] - """ - return 2.0 * images - 1.0 - - @staticmethod - def denormalize(images): - """ - Denormalize an image array to [0,1] - """ - return (images / 2 + 0.5).clip(0, 1) - - @staticmethod - def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image: - """ - Converts an image to RGB format. - """ - image = image.convert("RGB") - return image - - def resize( - self, - image: PIL.Image.Image, - height: Optional[int] = None, - width: Optional[int] = None, - ) -> PIL.Image.Image: - """ - Resize a PIL image. Both height and width will be downscaled to the next integer multiple of `vae_scale_factor` - """ - if height is None: - height = image.height - if width is None: - width = image.width - - width, height = ( - x - x % self.config.vae_scale_factor for x in (width, height) - ) # resize to integer multiple of vae_scale_factor - image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample]) - return image - - def preprocess( - self, - image: Union[paddle.Tensor, PIL.Image.Image, np.ndarray], - height: Optional[int] = None, - width: Optional[int] = None, - do_normalize: Optional[bool] = None, # new added, not exists in diffusers - ) -> paddle.Tensor: - """ - Preprocess the image input, accepted formats are PIL images, numpy arrays or paddle tensors" - """ - supported_formats = (PIL.Image.Image, np.ndarray, paddle.Tensor) - if isinstance(image, supported_formats): - image = [image] - elif not (isinstance(image, list) and all(isinstance(i, supported_formats) for i in image)): - raise ValueError( - f"Input is in incorrect format: {[type(i) for i in image]}. 
Currently, we only support {', '.join(supported_formats)}" - ) - - if isinstance(image[0], PIL.Image.Image): - if self.config.do_convert_rgb: - image = [self.convert_to_rgb(i) for i in image] - if self.config.do_resize: - image = [self.resize(i, height, width) for i in image] - image = self.pil_to_numpy(image) # to np - image = self.numpy_to_pd(image) # to pd - - elif isinstance(image[0], np.ndarray): - image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) - image = self.numpy_to_pd(image) - _, _, height, width = image.shape - if self.config.do_resize and ( - height % self.config.vae_scale_factor != 0 or width % self.config.vae_scale_factor != 0 - ): - raise ValueError( - f"Currently we only support resizing for PIL image - please resize your numpy array to be divisible by {self.config.vae_scale_factor}" - f"currently the sizes are {height} and {width}. You can also pass a PIL image instead to use resize option in VAEImageProcessor" - ) - - elif isinstance(image[0], paddle.Tensor): - image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0) - _, channel, height, width = image.shape - - # don't need any preprocess if the image is latents - if channel == 4: - return image - - if self.config.do_resize and ( - height % self.config.vae_scale_factor != 0 or width % self.config.vae_scale_factor != 0 - ): - raise ValueError( - f"Currently we only support resizing for PIL image - please resize your paddle tensor to be divisible by {self.config.vae_scale_factor}" - f"currently the sizes are {height} and {width}. You can also pass a PIL image instead to use resize option in VAEImageProcessor" - ) - - # expected range [0,1], normalize to [-1,1] - do_normalize = self.config.do_normalize if do_normalize is None else do_normalize - if image.min() < 0: - warnings.warn( - "Passing `image` as paddle tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] " - f"when passing as paddle tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]", - FutureWarning, - ) - do_normalize = False - - if do_normalize: - image = self.normalize(image) - - return image - - def postprocess( - self, - image: paddle.Tensor, - output_type: str = "pil", - do_denormalize: Optional[List[bool]] = None, - ): - if not isinstance(image, paddle.Tensor): - raise ValueError( - f"Input for postprocessing is in incorrect format: {type(image)}. We only support paddle tensor" - ) - if output_type not in ["latent", "pd", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated and has been set to `np`. 
Please make sure to set it to one of these instead: " - "`pil`, `np`, `pd`, `latent`" - ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) - output_type = "np" - - if output_type == "latent": - return image - - if do_denormalize is None: - do_denormalize = [self.config.do_normalize] * image.shape[0] - - image = paddle.stack( - [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])] - ) - - if output_type == "pd": - return image - - image = self.pd_to_numpy(image) - - if output_type == "np": - return image - - if output_type == "pil": - return self.numpy_to_pil(image) diff --git a/ppdiffusers/ppdiffusers/initializer.py b/ppdiffusers/ppdiffusers/initializer.py deleted file mode 100644 index d084b5090bb4..000000000000 --- a/ppdiffusers/ppdiffusers/initializer.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and - -# limitations under the License. - -# NOTE: This file is deprecated and will be removed in a future version. -# It only exists so that `from ppdiffusers.initializer import *` temporarily keeps working -# flake8: noqa -from .utils.initializer_utils import * # noqa: F401 diff --git a/ppdiffusers/ppdiffusers/loaders.py b/ppdiffusers/ppdiffusers/loaders.py deleted file mode 100644 index 351b6ba3a1e8..000000000000 --- a/ppdiffusers/ppdiffusers/loaders.py +++ /dev/null @@ -1,1621 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
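Before moving on to `loaders.py`, a quick round-trip sketch of the `VaeImageProcessor` removed above: `preprocess` turns PIL/NumPy/Paddle inputs into a normalized NCHW tensor and `postprocess` maps it back. The file names and the import path (valid only for a version that still ships `ppdiffusers.image_processor`) are assumptions, not part of this patch.

```py
# Sketch only; "input.png" and "roundtrip.png" are placeholder file names.
import PIL.Image

from ppdiffusers.image_processor import VaeImageProcessor

processor = VaeImageProcessor(vae_scale_factor=8, do_convert_rgb=True)

image = PIL.Image.open("input.png")
# paddle.Tensor of shape [1, 3, H, W] with values in [-1, 1]; H and W are
# rounded down to the nearest multiple of vae_scale_factor
batch = processor.preprocess(image)
# back to a list of PIL images
restored = processor.postprocess(batch, output_type="pil")[0]
restored.save("roundtrip.png")
```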
-import copy -import os -import warnings -from collections import defaultdict -from pathlib import Path -from typing import Callable, Dict, List, Optional, Union - -import paddle -import paddle.nn as nn -from huggingface_hub import hf_hub_download -from huggingface_hub.file_download import _request_wrapper, hf_raise_for_status - -from .models.attention_processor import ( - CustomDiffusionAttnProcessor, - CustomDiffusionXFormersAttnProcessor, - LoRAAttnProcessor, -) -from .models.modeling_utils import convert_state_dict -from .utils import ( - DIFFUSERS_CACHE, - FROM_DIFFUSERS, - FROM_HF_HUB, - HF_HUB_OFFLINE, - PPDIFFUSERS_CACHE, - TEXT_ENCODER_ATTN_MODULE, - TO_DIFFUSERS, - _get_model_file, - is_paddlenlp_available, - is_safetensors_available, - is_torch_available, - is_torch_file, - logging, - ppdiffusers_url_download, - safetensors_load, - smart_load, - torch_load, -) - -logger = logging.get_logger(__name__) - -if is_torch_available(): - import torch -if is_safetensors_available(): - import safetensors - -if is_paddlenlp_available(): - from paddlenlp.transformers import PretrainedModel, PretrainedTokenizer - -TEXT_ENCODER_NAME = "text_encoder" -UNET_NAME = "unet" - -TORCH_LORA_WEIGHT_NAME = "pytorch_lora_weights.bin" -TORCH_LORA_WEIGHT_NAME_SAFE = "pytorch_lora_weights.safetensors" -PADDLE_LORA_WEIGHT_NAME = "paddle_lora_weights.pdparams" - -TORCH_TEXT_INVERSION_NAME = "learned_embeds.bin" -TORCH_TEXT_INVERSION_NAME_SAFE = "learned_embeds.safetensors" -PADDLE_TEXT_INVERSION_NAME = "learned_embeds.pdparams" - -TORCH_CUSTOM_DIFFUSION_WEIGHT_NAME = "pytorch_custom_diffusion_weights.bin" -TORCH_CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE = "pytorch_custom_diffusion_weights.safetensors" -PADDLE_CUSTOM_DIFFUSION_WEIGHT_NAME = "paddle_custom_diffusion_weights.pdparams" - - -def transpose_state_dict(state_dict, name_mapping=None): - new_state_dict = {} - for k, v in state_dict.items(): - if name_mapping is not None: - for old_name, new_name in name_mapping.items(): - k = k.replace(old_name, new_name) - if v.ndim == 2: - new_state_dict[k] = v.T.contiguous() if hasattr(v, "contiguous") else v.T - else: - new_state_dict[k] = v.contiguous() if hasattr(v, "contiguous") else v - return new_state_dict - - -class AttnProcsLayers(nn.Layer): - def __init__(self, state_dict: Dict[str, paddle.Tensor]): - super().__init__() - self.layers = nn.LayerList(state_dict.values()) - self.mapping = dict(enumerate(state_dict.keys())) - self.rev_mapping = {v: k for k, v in enumerate(state_dict.keys())} - - # .processor for unet, .self_attn for text encoder - self.split_keys = [".processor", ".self_attn"] - - # we add a hook to state_dict() and load_state_dict() so that the - # naming fits with `unet.attn_processors` - def map_to(state_dict, *args, **kwargs): - new_state_dict = {} - for key, value in state_dict.items(): - num = int(key.split(".")[1]) # 0 is always "layers" - new_key = key.replace(f"layers.{num}", self.mapping[num]) - new_state_dict[new_key] = value - - return new_state_dict - - def remap_key(key, state_dict): - for k in self.split_keys: - if k in key: - return key.split(k)[0] + k - - raise ValueError( - f"There seems to be a problem with the state_dict: {set(state_dict.keys())}. {key} has to have one of {self.split_keys}." 
- ) - - def map_from(module, state_dict, *args, **kwargs): - all_keys = list(state_dict.keys()) - for key in all_keys: - replace_key = remap_key(key, state_dict) - new_key = key.replace(replace_key, f"layers.{module.rev_mapping[replace_key]}") - state_dict[new_key] = state_dict[key] - del state_dict[key] - - self.register_state_dict_hook(map_to) - self.register_load_state_dict_pre_hook(map_from, with_module=True) - - -class UNet2DConditionLoadersMixin: - text_encoder_name = TEXT_ENCODER_NAME - unet_name = UNET_NAME - - def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, paddle.Tensor]], **kwargs): - r""" - Load pretrained attention processor layers into `UNet2DConditionModel`. Attention processor layers have to be - defined in - [`cross_attention.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py) - and be a `nn.Layer` class. - - - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. - Valid model ids should have an organization name, like `google/ddpm-celebahq-256`. - - A path to a *directory* containing model weights saved using [`~ModelMixin.save_config`], e.g., - `./my_model_directory/`. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only(`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). - use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `diffusers-cli login` (stored in `~/.huggingface`). - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. - subfolder (`str`, *optional*, defaults to `""`): - In case the relevant files are located inside a subfolder of the model repo (either remote in - huggingface.co or downloaded locally), you can specify the folder name here. - mirror (`str`, *optional*): - Mirror source to accelerate downloads in China. If you are from China and have an accessibility - problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. - Please refer to the mirror site for more information. - from_hf_hub (bool, optional): whether to load from Huggingface Hub. 
- from_diffusers (`bool`, *optional*, defaults to `False`): - Load the model weights from a torch checkpoint save file. - - It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated - models](https://huggingface.co/docs/hub/models-gated#gated-models). - - """ - from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = ( - kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) - ) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS) - proxies = kwargs.pop("proxies", None) - local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) - use_auth_token = kwargs.pop("use_auth_token", None) - revision = kwargs.pop("revision", None) - subfolder = kwargs.pop("subfolder", None) - weight_name = kwargs.pop("weight_name", None) - use_safetensors = kwargs.pop("use_safetensors", None) - # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script. - # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning - network_alpha = kwargs.pop("network_alpha", None) - - if from_diffusers and use_safetensors and not is_safetensors_available(): - raise ValueError( - "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors" - ) - if use_safetensors is None: - use_safetensors = is_safetensors_available() - - user_agent = { - "file_type": "attn_procs_weights", - "framework": "pytorch" if from_diffusers else "paddle", - } - - model_file = None - - if not isinstance(pretrained_model_name_or_path_or_dict, dict): - if from_diffusers: - # Let's first try to load .safetensors weights - if (use_safetensors and weight_name is None) or ( - weight_name is not None and weight_name.endswith(".safetensors") - ): - try: - model_file = _get_model_file( - pretrained_model_name_or_path_or_dict, - weights_name=weight_name or TORCH_LORA_WEIGHT_NAME_SAFE, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - from_hf_hub=from_hf_hub, - ) - state_dict = smart_load(model_file) - except Exception: - model_file = None - pass - if model_file is None: - model_file = _get_model_file( - pretrained_model_name_or_path_or_dict, - weights_name=weight_name or TORCH_LORA_WEIGHT_NAME, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - from_hf_hub=from_hf_hub, - ) - state_dict = smart_load(model_file) - else: - model_file = _get_model_file( - pretrained_model_name_or_path_or_dict, - weights_name=weight_name or PADDLE_LORA_WEIGHT_NAME, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - from_hf_hub=from_hf_hub, - ) - state_dict = smart_load(model_file) - else: - state_dict = pretrained_model_name_or_path_or_dict - - # fill attn processors - attn_processors = {} - - is_lora = all("lora" in k for k in 
state_dict.keys()) - is_custom_diffusion = any("custom_diffusion" in k for k in state_dict.keys()) - - if from_diffusers or is_torch_file(model_file): - state_dict = transpose_state_dict(state_dict) - - if is_lora: - is_new_lora_format = all( - key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in state_dict.keys() - ) - if is_new_lora_format: - # Strip the `"unet"` prefix. - is_text_encoder_present = any(key.startswith(self.text_encoder_name) for key in state_dict.keys()) - if is_text_encoder_present: - warn_message = "The state_dict contains LoRA params corresponding to the text encoder which are not being used here. To use both UNet and text encoder related LoRA params, use [`pipe.load_lora_weights()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.load_lora_weights)." - warnings.warn(warn_message) - unet_keys = [k for k in state_dict.keys() if k.startswith(self.unet_name)] - state_dict = {k.replace(f"{self.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys} - - lora_grouped_dict = defaultdict(dict) - for key, value in state_dict.items(): - attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) - lora_grouped_dict[attn_processor_key][sub_key] = value.cast( - dtype="float32" - ) # we must cast this to float32 - - for key, value_dict in lora_grouped_dict.items(): - rank = value_dict["to_k_lora.down.weight"].shape[1] # 0 -> 1, torch vs paddle nn.Linear - cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[0] # 1 -> 0, torch vs paddle nn.Linear - hidden_size = value_dict["to_k_lora.up.weight"].shape[1] # 0 -> 1, torch vs paddle nn.Linear - - attn_processors[key] = LoRAAttnProcessor( - hidden_size=hidden_size, - cross_attention_dim=cross_attention_dim, - rank=rank, - network_alpha=network_alpha, - ) - attn_processors[key].load_dict(value_dict) - elif is_custom_diffusion: - custom_diffusion_grouped_dict = defaultdict(dict) - for key, value in state_dict.items(): - if len(value) == 0: - custom_diffusion_grouped_dict[key] = {} - else: - if "to_out" in key: - attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) - else: - attn_processor_key, sub_key = ".".join(key.split(".")[:-2]), ".".join(key.split(".")[-2:]) - custom_diffusion_grouped_dict[attn_processor_key][sub_key] = value.cast( - dtype="float32" - ) # we must cast this to float32 - - for key, value_dict in custom_diffusion_grouped_dict.items(): - if len(value_dict) == 0: - attn_processors[key] = CustomDiffusionAttnProcessor( - train_kv=False, train_q_out=False, hidden_size=None, cross_attention_dim=None - ) - else: - cross_attention_dim = value_dict["to_k_custom_diffusion.weight"].shape[ - 0 - ] # 1 -> 0, torch vs paddle nn.Linear - hidden_size = value_dict["to_k_custom_diffusion.weight"].shape[ - 1 - ] # 0 -> 1, torch vs paddle nn.Linear - train_q_out = True if "to_q_custom_diffusion.weight" in value_dict else False - attn_processors[key] = CustomDiffusionAttnProcessor( - train_kv=True, - train_q_out=train_q_out, - hidden_size=hidden_size, - cross_attention_dim=cross_attention_dim, - ) - attn_processors[key].load_dict(value_dict) - else: - raise ValueError( - f"{model_file} does not seem to be in the correct format expected by LoRA or Custom Diffusion training." 
- ) - # set correct dtype & device - attn_processors = {k: v.to(dtype=self.dtype) for k, v in attn_processors.items()} - - # set layers - self.set_attn_processor(attn_processors) - - def save_attn_procs( - self, - save_directory: Union[str, os.PathLike], - is_main_process: bool = True, - weight_name: str = None, - save_function: Callable = None, - safe_serialization: bool = False, - to_diffusers: Optional[bool] = None, - ): - r""" - Save an attention processor to a directory, so that it can be re-loaded using the - `[`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`]` method. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to which to save. Will be created if it doesn't exist. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful when in distributed training like - TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on - the main process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful on distributed training like TPUs when one - need to replace `paddle.save` by another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - variant (`str`, *optional*): - If specified, weights are saved in the format pytorch_model..bin. - to_diffusers (`bool`, *optional*, defaults to `None`): - If specified, weights are saved in the format of torch. eg. linear need transpose. - safe_serialization (`bool`, *optional*, defaults to `False`): - Only when `to_diffusers` is True, Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). - """ - if to_diffusers is None: - to_diffusers = TO_DIFFUSERS - if to_diffusers and safe_serialization and not is_safetensors_available(): - raise ImportError("`safe_serialization` requires the `safetensors library: `pip install safetensors`.") - - if os.path.isfile(save_directory): - logger.error(f"Provided path ({save_directory}) should be a directory, not a file") - return - - os.makedirs(save_directory, exist_ok=True) - - is_custom_diffusion = any( - isinstance(x, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor)) - for (_, x) in self.attn_processors.items() - ) - if is_custom_diffusion: - model_to_save = AttnProcsLayers( - { - y: x - for (y, x) in self.attn_processors.items() - if isinstance(x, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor)) - } - ) - state_dict = model_to_save.state_dict() - for name, attn in self.attn_processors.items(): - if len(attn.state_dict()) == 0: - state_dict[name] = {} - else: - model_to_save = AttnProcsLayers(self.attn_processors) - state_dict = model_to_save.state_dict() - - if weight_name is None: - if to_diffusers: - if safe_serialization: - weight_name = ( - TORCH_CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE if is_custom_diffusion else TORCH_LORA_WEIGHT_NAME_SAFE - ) - else: - weight_name = TORCH_CUSTOM_DIFFUSION_WEIGHT_NAME if is_custom_diffusion else TORCH_LORA_WEIGHT_NAME - else: - weight_name = PADDLE_CUSTOM_DIFFUSION_WEIGHT_NAME if is_custom_diffusion else PADDLE_LORA_WEIGHT_NAME - - # choose save_function - if save_function is None: - if to_diffusers: - if safe_serialization: - if is_torch_available(): - _save_function = safetensors.torch.save_file - state_dict = convert_state_dict(state_dict, framework="torch") - else: - _save_function = safetensors.numpy.save_file - state_dict = convert_state_dict(state_dict, 
framework="numpy") - - def save_function(weights, filename): - return _save_function(weights, filename, metadata={"format": "pt"}) - - else: - if not is_torch_available(): - raise ImportError( - "`to_diffusers=True` with `safe_serialization=False` requires the `torch library: `pip install torch`." - ) - save_function = torch.save - state_dict = convert_state_dict(state_dict, framework="torch") - state_dict = transpose_state_dict(state_dict) - else: - save_function = paddle.save - - # Save the model - save_function(state_dict, os.path.join(save_directory, weight_name)) - - logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}") - - -class TextualInversionLoaderMixin: - r""" - Mixin class for loading textual inversion tokens and embeddings to the tokenizer and text encoder. - """ - - def maybe_convert_prompt(self, prompt: Union[str, List[str]], tokenizer: "PretrainedTokenizer"): - r""" - Maybe convert a prompt into a "multi vector"-compatible prompt. If the prompt includes a token that corresponds - to a multi-vector textual inversion embedding, this function will process the prompt so that the special token - is replaced with multiple special tokens each corresponding to one of the vectors. If the prompt has no textual - inversion token or a textual inversion token that is a single vector, the input prompt is simply returned. - Parameters: - prompt (`str` or list of `str`): - The prompt or prompts to guide the image generation. - tokenizer (`PretrainedTokenizer`): - The tokenizer responsible for encoding the prompt into input tokens. - Returns: - `str` or list of `str`: The converted prompt - """ - if not isinstance(prompt, List): - prompts = [prompt] - else: - prompts = prompt - - prompts = [self._maybe_convert_prompt(p, tokenizer) for p in prompts] - - if not isinstance(prompt, List): - return prompts[0] - - return prompts - - def _maybe_convert_prompt(self, prompt: str, tokenizer: "PretrainedTokenizer"): - r""" - Maybe convert a prompt into a "multi vector"-compatible prompt. If the prompt includes a token that corresponds - to a multi-vector textual inversion embedding, this function will process the prompt so that the special token - is replaced with multiple special tokens each corresponding to one of the vectors. If the prompt has no textual - inversion token or a textual inversion token that is a single vector, the input prompt is simply returned. - Parameters: - prompt (`str`): - The prompt to guide the image generation. - tokenizer (`PretrainedTokenizer`): - The tokenizer responsible for encoding the prompt into input tokens. - Returns: - `str`: The converted prompt - """ - tokens = tokenizer.tokenize(prompt) - for token in tokens: - if token in tokenizer.added_tokens_encoder: - replacement = token - i = 1 - while f"{token}_{i}" in tokenizer.added_tokens_encoder: - replacement += f" {token}_{i}" - i += 1 - - prompt = prompt.replace(token, replacement) - - return prompt - - def load_textual_inversion( - self, - pretrained_model_name_or_path: Union[str, Dict[str, paddle.Tensor]], - token: Optional[str] = None, - **kwargs - ): - r""" - Load textual inversion embeddings into the text encoder of stable diffusion pipelines. Both `diffusers` and - `Automatic1111` formats are supported (see example below). - - This function is experimental and might change in the future. - - Parameters: - pretrained_model_name_or_path (`str` or `os.PathLike`): - Can be either: - - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. 
- Valid model ids should have an organization name, like - `"sd-concepts-library/low-poly-hd-logos-icons"`. - - A path to a *directory* containing textual inversion weights, e.g. - `./my_text_inversion_directory/`. - weight_name (`str`, *optional*): - Name of a custom weight file. This should be used in two cases: - - The saved textual inversion file is in `diffusers` format, but was saved under a specific weight - name, such as `text_inv.bin`. - - The saved textual inversion file is in the "Automatic1111" form. - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only(`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). - use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `diffusers-cli login` (stored in `~/.huggingface`). - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. - subfolder (`str`, *optional*, defaults to `""`): - In case the relevant files are located inside a subfolder of the model repo (either remote in - huggingface.co or downloaded locally), you can specify the folder name here. - mirror (`str`, *optional*): - Mirror source to accelerate downloads in China. If you are from China and have an accessibility - problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. - Please refer to the mirror site for more information. - - It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated - models](https://huggingface.co/docs/hub/models-gated#gated-models). - - Example: - To load a textual inversion embedding vector in `ppdiffusers` format: - ```py - from ppdiffusers import StableDiffusionPipeline - import paddle - model_id = "runwayml/stable-diffusion-v1-5" - pipe = StableDiffusionPipeline.from_pretrained(model_id, paddle_dtype=paddle.float16) - pipe.load_textual_inversion("sd-concepts-library/cat-toy") - prompt = "A backpack" - image = pipe(prompt, num_inference_steps=50).images[0] - image.save("cat-backpack.png") - ``` - To load a textual inversion embedding vector in Automatic1111 format, make sure to first download the vector, - e.g. 
from [civitAI](https://civitai.com/models/3036?modelVersionId=9857) and then load the vector locally: - ```py - from ppdiffusers import StableDiffusionPipeline - import paddle - model_id = "runwayml/stable-diffusion-v1-5" - pipe = StableDiffusionPipeline.from_pretrained(model_id, paddle_dtype=paddle.float16) - pipe.load_textual_inversion("./charturnerv2.pt", token="charturnerv2") - prompt = "charturnerv2, multiple views of the same character in the same outfit, a character turnaround of a woman wearing a black jacket and red shirt, best quality, intricate details." - image = pipe(prompt, num_inference_steps=50).images[0] - image.save("character.png") - ``` - """ - if not hasattr(self, "tokenizer") or not isinstance(self.tokenizer, PretrainedTokenizer): - raise ValueError( - f"{self.__class__.__name__} requires `self.tokenizer` of type `PretrainedTokenizer` for calling" - f" `{self.load_textual_inversion.__name__}`" - ) - - if not hasattr(self, "text_encoder") or not isinstance(self.text_encoder, PretrainedModel): - raise ValueError( - f"{self.__class__.__name__} requires `self.text_encoder` of type `PretrainedModel` for calling" - f" `{self.load_textual_inversion.__name__}`" - ) - from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = ( - kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) - ) - from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) - use_auth_token = kwargs.pop("use_auth_token", None) - revision = kwargs.pop("revision", None) - subfolder = kwargs.pop("subfolder", None) - weight_name = kwargs.pop("weight_name", None) - use_safetensors = kwargs.pop("use_safetensors", None) - - if from_diffusers and use_safetensors and not is_safetensors_available(): - raise ValueError( - "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors`" - ) - if use_safetensors is None: - use_safetensors = is_safetensors_available() - user_agent = { - "file_type": "text_inversion", - "framework": "pytorch" if from_diffusers else "paddle", - } - - # 1.
Load textual inversion file - model_file = None - # Let's first try to load .safetensors weights - if from_diffusers: - if (use_safetensors and weight_name is None) or ( - weight_name is not None and weight_name.endswith(".safetensors") - ): - try: - model_file = _get_model_file( - pretrained_model_name_or_path, - weights_name=weight_name or TORCH_TEXT_INVERSION_NAME_SAFE, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - from_hf_hub=from_hf_hub, - ) - state_dict = safetensors_load(model_file) - except Exception: - model_file = None - pass - if model_file is None: - model_file = _get_model_file( - pretrained_model_name_or_path, - weights_name=weight_name or TORCH_TEXT_INVERSION_NAME, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - from_hf_hub=from_hf_hub, - ) - state_dict = torch_load(model_file) - else: - model_file = _get_model_file( - pretrained_model_name_or_path, - weights_name=weight_name or PADDLE_TEXT_INVERSION_NAME, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - from_hf_hub=from_hf_hub, - ) - if is_torch_file(model_file): - try: - state_dict = safetensors_load(model_file) - except: - state_dict = torch_load(model_file) - else: - state_dict = paddle.load(model_file) - - # 2. Load token and embedding correcly from file - if isinstance(state_dict, paddle.Tensor): - if token is None: - raise ValueError( - "You are trying to load a textual inversion embedding that has been saved as a Paddle tensor. Make sure to pass the name of the corresponding token in this case: `token=...`." - ) - embedding = state_dict - elif len(state_dict) == 1: - # diffusers - loaded_token, embedding = next(iter(state_dict.items())) - elif "string_to_param" in state_dict: - # A1111 - loaded_token = state_dict["name"] - embedding = state_dict["string_to_param"]["*"] - - if token is not None and loaded_token != token: - logger.warn(f"The loaded token: {loaded_token} is overwritten by the passed token {token}.") - else: - token = loaded_token - - if not isinstance(state_dict, paddle.Tensor): - if hasattr(embedding, "detach"): - embedding = embedding.detach() - if hasattr(embedding, "cpu"): - embedding = embedding.cpu() - if hasattr(embedding, "numpy"): - embedding = embedding.numpy() - embedding = paddle.to_tensor(embedding) - embedding = embedding.cast(dtype=self.text_encoder.dtype) - - # 3. Make sure we don't mess up the tokenizer or text encoder - vocab = self.tokenizer.get_vocab() - if token in vocab: - raise ValueError( - f"Token {token} already in tokenizer vocabulary. Please choose a different token name or remove {token} and embedding from the tokenizer and text encoder." - ) - elif f"{token}_1" in vocab: - multi_vector_tokens = [token] - i = 1 - while f"{token}_{i}" in self.tokenizer.added_tokens_encoder: - multi_vector_tokens.append(f"{token}_{i}") - i += 1 - - raise ValueError( - f"Multi-vector Token {multi_vector_tokens} already in tokenizer vocabulary. 
Please choose a different token name or remove the {multi_vector_tokens} and embedding from the tokenizer and text encoder." - ) - - is_multi_vector = len(embedding.shape) > 1 and embedding.shape[0] > 1 - - if is_multi_vector: - tokens = [token] + [f"{token}_{i}" for i in range(1, embedding.shape[0])] - embeddings = [e for e in embedding] # noqa: C416 - else: - tokens = [token] - embeddings = [embedding[0]] if len(embedding.shape) > 1 else [embedding] - - # add tokens and get ids - self.tokenizer.add_tokens(tokens) - token_ids = self.tokenizer.convert_tokens_to_ids(tokens) - - # resize token embeddings and set new embeddings - self.text_encoder.resize_token_embeddings(len(self.tokenizer)) - with paddle.no_grad(): - for token_id, embedding in zip(token_ids, embeddings): - self.text_encoder.get_input_embeddings().weight[token_id] = embedding - - logger.info(f"Loaded textual inversion embedding for {token}.") - - -class LoraLoaderMixin: - r""" - Utility class for handling the loading LoRA layers into UNet (of class [`UNet2DConditionModel`]) and Text Encoder - (of class [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel)). - - This function is experimental and might change in the future. - - """ - text_encoder_name = TEXT_ENCODER_NAME - unet_name = UNET_NAME - - def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, paddle.Tensor]], **kwargs): - r""" - Load pretrained attention processor layers (such as LoRA) into [`UNet2DConditionModel`] and - [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel)). - - This function is experimental and might change in the future. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. - Valid model ids should have an organization name, like `google/ddpm-celebahq-256`. - - A path to a *directory* containing model weights saved using [`~ModelMixin.save_config`], e.g., - `./my_model_directory/`. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only(`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). - use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `diffusers-cli login` (stored in `~/.huggingface`). - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. 
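The multi-vector branch just above expands one embedding matrix into a family of suffixed tokens before writing them into the text encoder. A minimal sketch of that expansion, using an illustrative 4-vector embedding and the `charturnerv2` token from the docstring example:

```py
import paddle

# Illustrative only: a 4-vector textual-inversion embedding for token "charturnerv2".
embedding = paddle.randn([4, 768])
token = "charturnerv2"
tokens = [token] + [f"{token}_{i}" for i in range(1, embedding.shape[0])]
embeddings = [e for e in embedding]  # one 768-dim vector per new token
assert tokens == ["charturnerv2", "charturnerv2_1", "charturnerv2_2", "charturnerv2_3"]
assert len(embeddings) == 4 and embeddings[0].shape == [768]
```

Each token in `tokens` is then added to the tokenizer and its vector written into the resized input-embedding matrix, as the loop above does.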
It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. - subfolder (`str`, *optional*, defaults to `""`): - In case the relevant files are located inside a subfolder of the model repo (either remote in - huggingface.co or downloaded locally), you can specify the folder name here. - mirror (`str`, *optional*): - Mirror source to accelerate downloads in China. If you are from China and have an accessibility - problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. - Please refer to the mirror site for more information. - - It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated - models](https://huggingface.co/docs/hub/models-gated#gated-models). - - """ - # Load the main state dict first which has the LoRA layers for either of - # UNet and text encoder or both. - from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = ( - kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) - ) - from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) - use_auth_token = kwargs.pop("use_auth_token", None) - revision = kwargs.pop("revision", None) - subfolder = kwargs.pop("subfolder", None) - weight_name = kwargs.pop("weight_name", None) - use_safetensors = kwargs.pop("use_safetensors", None) - - # set lora scale to a reasonable default - self._lora_scale = 1.0 - - if from_diffusers and use_safetensors and not is_safetensors_available(): - raise ValueError( - "`use_safetensors`=True but safetensors is not installed. 
Please install safetensors with `pip install safetenstors" - ) - if use_safetensors is None: - use_safetensors = is_safetensors_available() - - user_agent = { - "file_type": "attn_procs_weights", - "framework": "pytorch" if from_diffusers else "paddle", - } - - model_file = None - - if not isinstance(pretrained_model_name_or_path_or_dict, dict): - if from_diffusers: - # Let's first try to load .safetensors weights - if (use_safetensors and weight_name is None) or ( - weight_name is not None and weight_name.endswith(".safetensors") - ): - try: - model_file = _get_model_file( - pretrained_model_name_or_path_or_dict, - weights_name=weight_name or TORCH_LORA_WEIGHT_NAME_SAFE, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - from_hf_hub=from_hf_hub, - ) - state_dict = smart_load(model_file) - except Exception: - model_file = None - pass - if model_file is None: - model_file = _get_model_file( - pretrained_model_name_or_path_or_dict, - weights_name=weight_name or TORCH_LORA_WEIGHT_NAME, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - from_hf_hub=from_hf_hub, - ) - state_dict = smart_load(model_file) - else: - model_file = _get_model_file( - pretrained_model_name_or_path_or_dict, - weights_name=weight_name or PADDLE_LORA_WEIGHT_NAME, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - from_hf_hub=from_hf_hub, - ) - state_dict = smart_load(model_file) - else: - state_dict = pretrained_model_name_or_path_or_dict - - if not from_diffusers: - from_diffusers = is_torch_file(model_file) - - # Convert kohya-ss Style LoRA attn procs to ppdiffusers attn procs - network_alpha = None - if all((k.startswith("lora_te_") or k.startswith("lora_unet_")) for k in state_dict.keys()): - state_dict, network_alpha = self._convert_kohya_lora_to_diffusers(state_dict) - from_diffusers = True - - # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918), - # then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as - # their prefixes. - keys = list(state_dict.keys()) - if all(key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in keys): - # Load the layers corresponding to UNet. - unet_keys = [k for k in keys if k.startswith(self.unet_name)] - logger.info(f"Loading {self.unet_name}.") - unet_lora_state_dict = { - k.replace(f"{self.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys - } - self.unet.load_attn_procs(unet_lora_state_dict, network_alpha=network_alpha, from_diffusers=from_diffusers) - - # Load the layers corresponding to text encoder and make necessary adjustments. 
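Before the text-encoder half below, here is a standalone sketch of the prefix split that `load_lora_weights` applies to the combined state dict, assuming the usual `"unet"` / `"text_encoder"` prefixes; the key names are illustrative, the real ones come from `smart_load`:

```py
# Minimal sketch of the prefix-based partitioning used in load_lora_weights above.
state_dict = {
    "unet.down_blocks.0.attentions.0.transformer_blocks.0.attn1.processor.to_q_lora.down.weight": 0,
    "text_encoder.text_model.encoder.layers.0.self_attn.to_q_lora.down.weight": 0,
}
unet_sd = {k[len("unet."):]: v for k, v in state_dict.items() if k.startswith("unet.")}
te_sd = {k[len("text_encoder."):]: v for k, v in state_dict.items() if k.startswith("text_encoder.")}
# unet_sd feeds self.unet.load_attn_procs, te_sd feeds _load_text_encoder_attn_procs.
```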
- text_encoder_keys = [k for k in keys if k.startswith(self.text_encoder_name)] - text_encoder_lora_state_dict = { - k.replace(f"{self.text_encoder_name}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys - } - if len(text_encoder_lora_state_dict) > 0: - logger.info(f"Loading {self.text_encoder_name}.") - attn_procs_text_encoder = self._load_text_encoder_attn_procs( - text_encoder_lora_state_dict, - network_alpha=network_alpha, - from_diffusers=from_diffusers, - ) - self._modify_text_encoder(attn_procs_text_encoder) - - # save lora attn procs of text encoder so that it can be easily retrieved - self._text_encoder_lora_attn_procs = attn_procs_text_encoder - - # Otherwise, we're dealing with the old format. This means the `state_dict` should only - # contain the module names of the `unet` as its keys WITHOUT any prefix. - elif not all( - key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in state_dict.keys() - ): - self.unet.load_attn_procs(state_dict, network_alpha=network_alpha, from_diffusers=from_diffusers) - warn_message = "You have saved the LoRA weights using the old format. To convert the old LoRA weights to the new format, you can first load them in a dictionary and then create a new dictionary like the following: `new_state_dict = {f'unet'.{module_name}: params for module_name, params in old_state_dict.items()}`." - warnings.warn(warn_message) - - @property - def lora_scale(self) -> float: - # property function that returns the lora scale which can be set at run time by the pipeline. - # if _lora_scale has not been set, return 1 - return self._lora_scale if hasattr(self, "_lora_scale") else 1.0 - - @property - def text_encoder_lora_attn_procs(self): - if hasattr(self, "_text_encoder_lora_attn_procs"): - return self._text_encoder_lora_attn_procs - return - - def _remove_text_encoder_monkey_patch(self): - # Loop over the nn.MultiHeadAttention module of text_encoder - for name, attn_module in self.text_encoder.named_sublayers(include_self=True): - if name.endswith(TEXT_ENCODER_ATTN_MODULE): - # Loop over the LoRA layers - for _, text_encoder_attr in self._lora_attn_processor_attr_to_text_encoder_attr.items(): - # Retrieve the q/k/v/out projection of nn.MultiHeadAttention - module = attn_module.get_sublayer(text_encoder_attr) - if hasattr(module, "old_forward"): - # restore original `forward` to remove monkey-patch - module.forward = module.old_forward - delattr(module, "old_forward") - - # new added by Junnyu, no exists in diffusers - if hasattr(attn_module, "processor"): - # del processor - delattr(attn_module, "processor") - - def _modify_text_encoder(self, attn_processors: Dict[str, LoRAAttnProcessor]): - r""" - Monkey-patches the forward passes of attention modules of the text encoder. - - Parameters: - attn_processors: Dict[str, `LoRAAttnProcessor`]: - A dictionary mapping the module names and their corresponding [`~LoRAAttnProcessor`]. - """ - - # First, remove any monkey-patch that might have been applied before - self._remove_text_encoder_monkey_patch() - - # Loop over the nn.MultiHeadAttention module of text_encoder - for name, attn_module in self.text_encoder.named_sublayers(include_self=True): - if name.endswith(TEXT_ENCODER_ATTN_MODULE): - # Loop over the LoRA layers - for attn_proc_attr, text_encoder_attr in self._lora_attn_processor_attr_to_text_encoder_attr.items(): - # Retrieve the q/k/v/out projection of nn.MultiHeadAttention and its corresponding LoRA layer. 
- module = attn_module.get_sublayer(text_encoder_attr) - lora_layer = attn_processors[name].get_sublayer(attn_proc_attr) - # save old_forward to module that can be used to remove monkey-patch - old_forward = module.old_forward = module.forward - - # create a new scope that locks in the old_forward, lora_layer value for each new_forward function - # for more detail, see https://github.com/huggingface/diffusers/pull/3490#issuecomment-1555059060 - def make_new_forward(old_forward, lora_layer): - def new_forward(x): - result = old_forward(x) + self.lora_scale * lora_layer(x) - return result - - return new_forward - - # Monkey-patch. - module.forward = make_new_forward(old_forward, lora_layer) - - # new added by Junnyu, no exists in diffusers - attn_module.processor = attn_processors[name] - - @property - def _lora_attn_processor_attr_to_text_encoder_attr(self): - return { - "to_q_lora": "q_proj", - "to_k_lora": "k_proj", - "to_v_lora": "v_proj", - "to_out_lora": "out_proj", - } - - def _load_text_encoder_attn_procs( - self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, paddle.Tensor]], **kwargs - ): - r""" - Load pretrained attention processor layers for - [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel). - - This function is experimental and might change in the future. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. - Valid model ids should have an organization name, like `google/ddpm-celebahq-256`. - - A path to a *directory* containing model weights saved using [`~ModelMixin.save_config`], e.g., - `./my_model_directory/`. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only(`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). - use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `diffusers-cli login` (stored in `~/.huggingface`). - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. - subfolder (`str`, *optional*, defaults to `""`): - In case the relevant files are located inside a subfolder of the model repo (either remote in - huggingface.co or downloaded locally), you can specify the folder name here. 
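A note on the `make_new_forward` factory used in `_modify_text_encoder` above: the factory exists so each patched projection closes over its own `old_forward` and `lora_layer`; a closure defined directly in the loop would capture the loop variables late and every module would end up calling the last LoRA layer. A small self-contained illustration (names and lambdas are illustrative, not the pipeline's API):

```py
# Illustrative only: why a factory is needed when monkey-patching in a loop.
def make_new_forward(old_forward, lora_layer, scale=1.0):
    def new_forward(x):
        return old_forward(x) + scale * lora_layer(x)
    return new_forward

layers = {"q_proj": lambda x: x + 1, "k_proj": lambda x: x + 2}
loras = {"q_proj": lambda x: 10, "k_proj": lambda x: 20}
patched = {name: make_new_forward(fn, loras[name]) for name, fn in layers.items()}
assert patched["q_proj"](0) == 11 and patched["k_proj"](0) == 22  # each keeps its own lora
```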
- mirror (`str`, *optional*): - Mirror source to accelerate downloads in China. If you are from China and have an accessibility - problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. - Please refer to the mirror site for more information. - Returns: - `Dict[name, LoRAAttnProcessor]`: Mapping between the module names and their corresponding - [`LoRAAttnProcessor`]. - - It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated - models](https://huggingface.co/docs/hub/models-gated#gated-models). - - """ - - from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = ( - kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) - ) - from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) - use_auth_token = kwargs.pop("use_auth_token", None) - revision = kwargs.pop("revision", None) - subfolder = kwargs.pop("subfolder", None) - weight_name = kwargs.pop("weight_name", None) - use_safetensors = kwargs.pop("use_safetensors", None) - network_alpha = kwargs.pop("network_alpha", None) - - if from_diffusers and use_safetensors and not is_safetensors_available(): - raise ValueError( - "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors" - ) - if use_safetensors is None: - use_safetensors = is_safetensors_available() - user_agent = { - "file_type": "attn_procs_weights", - "framework": "pytorch" if from_diffusers else "paddle", - } - - model_file = None - if not isinstance(pretrained_model_name_or_path_or_dict, dict): - if from_diffusers: - # Let's first try to load .safetensors weights - if (use_safetensors and weight_name is None) or ( - weight_name is not None and weight_name.endswith(".safetensors") - ): - try: - model_file = _get_model_file( - pretrained_model_name_or_path_or_dict, - weights_name=weight_name or TORCH_LORA_WEIGHT_NAME_SAFE, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - from_hf_hub=from_hf_hub, - ) - state_dict = smart_load(model_file) - except Exception: - model_file = None - pass - if model_file is None: - model_file = _get_model_file( - pretrained_model_name_or_path_or_dict, - weights_name=weight_name or TORCH_LORA_WEIGHT_NAME, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - from_hf_hub=from_hf_hub, - ) - state_dict = smart_load(model_file) - else: - model_file = _get_model_file( - pretrained_model_name_or_path_or_dict, - weights_name=weight_name or PADDLE_LORA_WEIGHT_NAME, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - from_hf_hub=from_hf_hub, - ) - state_dict = smart_load(model_file) - else: - state_dict = pretrained_model_name_or_path_or_dict - - # fill 
attn processors - attn_processors = {} - - is_lora = all("lora" in k for k in state_dict.keys()) - - if from_diffusers or is_torch_file(model_file): - state_dict = transpose_state_dict(state_dict, name_mapping={".encoder.": ".transformer."}) - - if is_lora: - lora_grouped_dict = defaultdict(dict) - for key, value in state_dict.items(): - attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) - lora_grouped_dict[attn_processor_key][sub_key] = value.cast( - dtype="float32" - ) # we must cast this to float32 - - for key, value_dict in lora_grouped_dict.items(): - rank = value_dict["to_k_lora.down.weight"].shape[1] # 0 -> 1, torch vs paddle nn.Linear - cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[0] # 1 -> 0, torch vs paddle nn.Linear - hidden_size = value_dict["to_k_lora.up.weight"].shape[1] # 0 -> 1, torch vs paddle nn.Linear - - attn_processors[key] = LoRAAttnProcessor( - hidden_size=hidden_size, - cross_attention_dim=cross_attention_dim, - rank=rank, - network_alpha=network_alpha, - ) - attn_processors[key].load_dict(value_dict) - - else: - raise ValueError(f"{model_file} does not seem to be in the correct format expected by LoRA training.") - - # set correct dtype & device - attn_processors = {k: v.to(dtype=self.text_encoder.dtype) for k, v in attn_processors.items()} - return attn_processors - - @classmethod - def save_lora_weights( - self, - save_directory: Union[str, os.PathLike], - unet_lora_layers: Dict[str, nn.Layer] = None, - text_encoder_lora_layers: Dict[str, nn.Layer] = None, - is_main_process: bool = True, - weight_name: str = None, - save_function: Callable = None, - safe_serialization: bool = False, - to_diffusers: Optional[bool] = None, - ): - r""" - Save the LoRA parameters corresponding to the UNet and the text encoder. - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to which to save. Will be created if it doesn't exist. - unet_lora_layers (`Dict[str, nn.Layer`]): - State dict of the LoRA layers corresponding to the UNet. Specifying this helps to make the - serialization process easier and cleaner. - text_encoder_lora_layers (`Dict[str, nn.Layer`]): - State dict of the LoRA layers corresponding to the `text_encoder`. Since the `text_encoder` comes from - `paddlenlp`, we cannot rejig it. That is why we have to explicitly pass the text encoder LoRA state - dict. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful when in distributed training like - TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on - the main process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful on distributed training like TPUs when one - need to replace `torch.save` by another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - """ - if to_diffusers is None: - to_diffusers = TO_DIFFUSERS - if to_diffusers and safe_serialization and not is_safetensors_available(): - raise ImportError("`safe_serialization` requires the `safetensors library: `pip install safetensors`.") - - if os.path.isfile(save_directory): - logger.error(f"Provided path ({save_directory}) should be a directory, not a file") - return - - os.makedirs(save_directory, exist_ok=True) - - # Create a flat dictionary. 
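One detail worth spelling out from `_load_text_encoder_attn_procs` above before the save path continues: the LoRA rank, cross-attention dim, and hidden size are read straight off the weight shapes, and the indices are swapped relative to torch because `paddle.nn.Linear` stores weights as `[in_features, out_features]`. Illustrative numbers for a rank-4 LoRA on a 768-dim text encoder:

```py
# Illustrative shapes only; real values come from the loaded state dict.
# paddle nn.Linear weight is [in_features, out_features], the transpose of torch,
# which is why the loader above indexes shape[1] / shape[0] the way it does.
to_k_lora_down_weight_shape = [768, 4]   # [cross_attention_dim, rank]
to_k_lora_up_weight_shape = [4, 768]     # [rank, hidden_size]
rank = to_k_lora_down_weight_shape[1]
cross_attention_dim = to_k_lora_down_weight_shape[0]
hidden_size = to_k_lora_up_weight_shape[1]
assert (rank, cross_attention_dim, hidden_size) == (4, 768, 768)
```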
- state_dict = {} - if unet_lora_layers is not None: - unet_lora_state_dict = { - f"{self.unet_name}.{module_name}": param - for module_name, param in unet_lora_layers.state_dict().items() - } - state_dict.update(unet_lora_state_dict) - if text_encoder_lora_layers is not None: - text_encoder_lora_state_dict = { - f"{self.text_encoder_name}.{module_name}": param - for module_name, param in text_encoder_lora_layers.state_dict().items() - } - state_dict.update(text_encoder_lora_state_dict) - # TODO junnyu, rename paramaters. - - # Save the model - if weight_name is None: - if to_diffusers: - if safe_serialization: - weight_name = TORCH_LORA_WEIGHT_NAME_SAFE - else: - weight_name = TORCH_LORA_WEIGHT_NAME - else: - weight_name = PADDLE_LORA_WEIGHT_NAME - - # choose save_function - if save_function is None: - if to_diffusers: - if safe_serialization: - if is_torch_available(): - _save_function = safetensors.torch.save_file - state_dict = convert_state_dict(state_dict, framework="torch") - else: - _save_function = safetensors.numpy.save_file - state_dict = convert_state_dict(state_dict, framework="numpy") - - def save_function(weights, filename): - return _save_function(weights, filename, metadata={"format": "pt"}) - - else: - if not is_torch_available(): - raise ImportError( - "`to_diffusers=True` with `safe_serialization=False` requires the `torch library: `pip install torch`." - ) - save_function = torch.save - state_dict = convert_state_dict(state_dict, framework="torch") - state_dict = transpose_state_dict(state_dict, name_mapping={".transformer.": ".encoder."}) - else: - save_function = paddle.save - - save_function(state_dict, os.path.join(save_directory, weight_name)) - logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}") - - def _convert_kohya_lora_to_diffusers(self, state_dict): - unet_state_dict = {} - te_state_dict = {} - network_alpha = None - - for key, value in state_dict.items(): - if "lora_down" in key: - lora_name = key.split(".")[0] - lora_name_up = lora_name + ".lora_up.weight" - lora_name_alpha = lora_name + ".alpha" - if lora_name_alpha in state_dict: - # we must cast this to float32, before get item - alpha = state_dict[lora_name_alpha].cast("float32").item() - if network_alpha is None: - network_alpha = alpha - elif network_alpha != alpha: - raise ValueError("Network alpha is not consistent") - - if lora_name.startswith("lora_unet_"): - diffusers_name = key.replace("lora_unet_", "").replace("_", ".") - diffusers_name = diffusers_name.replace("down.blocks", "down_blocks") - diffusers_name = diffusers_name.replace("mid.block", "mid_block") - diffusers_name = diffusers_name.replace("up.blocks", "up_blocks") - diffusers_name = diffusers_name.replace("transformer.blocks", "transformer_blocks") - diffusers_name = diffusers_name.replace("to.q.lora", "to_q_lora") - diffusers_name = diffusers_name.replace("to.k.lora", "to_k_lora") - diffusers_name = diffusers_name.replace("to.v.lora", "to_v_lora") - diffusers_name = diffusers_name.replace("to.out.0.lora", "to_out_lora") - if "transformer_blocks" in diffusers_name: - if "attn1" in diffusers_name or "attn2" in diffusers_name: - diffusers_name = diffusers_name.replace("attn1", "attn1.processor") - diffusers_name = diffusers_name.replace("attn2", "attn2.processor") - unet_state_dict[diffusers_name] = value - unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict[lora_name_up] - elif lora_name.startswith("lora_te_"): - diffusers_name = key.replace("lora_te_", "").replace("_", ".") - 
diffusers_name = diffusers_name.replace("text.model", "text_model") - diffusers_name = diffusers_name.replace("self.attn", "self_attn") - diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora") - diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora") - diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora") - diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora") - if "self_attn" in diffusers_name: - te_state_dict[diffusers_name] = value - te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict[lora_name_up] - - unet_state_dict = {f"{UNET_NAME}.{module_name}": params for module_name, params in unet_state_dict.items()} - te_state_dict = {f"{TEXT_ENCODER_NAME}.{module_name}": params for module_name, params in te_state_dict.items()} - new_state_dict = {**unet_state_dict, **te_state_dict} - return new_state_dict, network_alpha - - -class FromCkptMixin: - """This helper class allows to directly load .ckpt or .safetensors stable diffusion file_extension - into the respective classes.""" - - @classmethod - def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): - r""" - Instantiate a Paddle diffusion pipeline from pre-trained pipeline weights saved in the original .ckpt format. - The pipeline is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). - Parameters: - pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*): - Can be either: - - A link to the .ckpt file on the Hub. Should be in the format - `"https://huggingface.co//blob/main/"` - - A link to the .safetensors file on the civitai.com. Should be in the format - `"https://civitai.com/api/download/models/"` - - A path to a *file* containing all pipeline weights. - paddle_dtype (`str` or `paddle.dtype`, *optional*): - Override the default `paddle.dtype` and load the model under this dtype. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). - use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `huggingface-cli login` (stored in `~/.huggingface`). - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. - extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for - checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults - to `False`. 
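To make the `_convert_kohya_lora_to_diffusers` renaming above concrete, here is a single text-encoder key traced through the same chain of string replacements (the key itself is illustrative):

```py
# Illustrative trace of the kohya-ss -> diffusers key renaming above.
key = "lora_te_text_model_encoder_layers_0_self_attn_q_proj.lora_down.weight"
name = key.replace("lora_te_", "").replace("_", ".")
name = name.replace("text.model", "text_model")
name = name.replace("self.attn", "self_attn")
name = name.replace("q.proj.lora", "to_q_lora")
assert name == "text_model.encoder.layers.0.self_attn.to_q_lora.down.weight"
```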
Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for - inference. Non-EMA weights are usually better to continue fine-tuning. - upcast_attention (`bool`, *optional*, defaults to `None`): - Whether the attention computation should always be upcasted. This is necessary when running stable - image_size (`int`, *optional*, defaults to 512): - The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2 - Base. Use 768 for Stable Diffusion v2. - prediction_type (`str`, *optional*): - The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion v1.X and Stable - Diffusion v2 Base. Use `'v_prediction'` for Stable Diffusion v2. - num_in_channels (`int`, *optional*, defaults to None): - The number of input channels. If `None`, it will be automatically inferred. - scheduler_type (`str`, *optional*, defaults to 'pndm'): - Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm", - "ddim"]`. - load_safety_checker (`bool`, *optional*, defaults to `False`): - Whether to load the safety checker or not. Defaults to `False`. - kwargs (remaining dictionary of keyword arguments, *optional*): - Can be used to overwrite load - and saveable variables - *i.e.* the pipeline components - of the - specific pipeline class. The overwritten components are then directly passed to the pipelines - `__init__` method. See example below for more information. - Examples: - ```py - >>> from ppdiffusers import StableDiffusionPipeline - >>> # Download pipeline from huggingface.co and cache. - >>> pipeline = StableDiffusionPipeline.from_ckpt( - ... "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors" - ... ) - >>> # Download pipeline from local file - >>> # file is downloaded under ./v1-5-pruned-emaonly.ckpt - >>> pipeline = StableDiffusionPipeline.from_ckpt("./v1-5-pruned-emaonly") - >>> # Enable float16 - >>> pipeline = StableDiffusionPipeline.from_ckpt( - ... "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt", - ... paddle_dtype=paddle.float16, - ... 
) - ``` - """ - # import here to avoid circular dependency - from .pipelines.stable_diffusion.convert_from_ckpt import ( - download_from_original_stable_diffusion_ckpt, - ) - - from_hf_hub = "huggingface.co" in pretrained_model_link_or_path or "hf.co" - cache_dir = ( - kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) - ) - resume_download = kwargs.pop("resume_download", False) - force_download = kwargs.pop("force_download", False) - proxies = kwargs.pop("proxies", None) - local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) - use_auth_token = kwargs.pop("use_auth_token", None) - revision = kwargs.pop("revision", None) - extract_ema = kwargs.pop("extract_ema", False) - image_size = kwargs.pop("image_size", 512) - scheduler_type = kwargs.pop("scheduler_type", "pndm") - num_in_channels = kwargs.pop("num_in_channels", None) - upcast_attention = kwargs.pop("upcast_attention", None) - load_safety_checker = kwargs.pop("load_safety_checker", False) - prediction_type = kwargs.pop("prediction_type", None) - - paddle_dtype = kwargs.pop("paddle_dtype", None) - - pipeline_name = cls.__name__ - - # TODO: For now we only support stable diffusion - stable_unclip = None - controlnet = False - - if pipeline_name == "StableDiffusionControlNetPipeline": - model_type = "FrozenCLIPEmbedder" - controlnet = True - elif "StableDiffusion" in pipeline_name: - model_type = "FrozenCLIPEmbedder" - elif pipeline_name == "StableUnCLIPPipeline": - model_type == "FrozenOpenCLIPEmbedder" - stable_unclip = "txt2img" - elif pipeline_name == "StableUnCLIPImg2ImgPipeline": - model_type == "FrozenOpenCLIPEmbedder" - stable_unclip = "img2img" - elif pipeline_name == "PaintByExamplePipeline": - model_type == "PaintByExample" - elif pipeline_name == "LDMTextToImagePipeline": - model_type == "LDMTextToImage" - else: - raise ValueError(f"Unhandled pipeline class: {pipeline_name}") - - pretrained_model_link_or_path = str(pretrained_model_link_or_path) - if os.path.isfile(pretrained_model_link_or_path): - checkpoint_path = pretrained_model_link_or_path - elif pretrained_model_link_or_path.startswith("http://") or pretrained_model_link_or_path.startswith( - "https://" - ): - # HF Hub models - if any(p in pretrained_model_link_or_path for p in ["huggingface.co", "hf.co"]): - # remove huggingface url - for prefix in ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]: - if pretrained_model_link_or_path.startswith(prefix): - pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :] - - # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained - ckpt_path = Path(pretrained_model_link_or_path) - if not ckpt_path.is_file(): - # get repo_id and (potentially nested) file path of ckpt in repo - repo_id = str(Path().joinpath(*ckpt_path.parts[:2])) - file_path = str(Path().joinpath(*ckpt_path.parts[2:])) - - if file_path.startswith("blob/"): - file_path = file_path[len("blob/") :] - - if file_path.startswith("main/"): - file_path = file_path[len("main/") :] - - checkpoint_path = hf_hub_download( - repo_id, - filename=file_path, - cache_dir=cache_dir, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - force_download=force_download, - ) - else: - checkpoint_path = ckpt_path - else: - checkpoint_path = ppdiffusers_url_download( - pretrained_model_link_or_path, - cache_dir=cache_dir, - 
filename=http_file_name(pretrained_model_link_or_path).strip('"'), - force_download=force_download, - resume_download=resume_download, - ) - else: - checkpoint_path = pretrained_model_link_or_path - - pipe = download_from_original_stable_diffusion_ckpt( - checkpoint_path, - pipeline_class=cls, - model_type=model_type, - stable_unclip=stable_unclip, - controlnet=controlnet, - extract_ema=extract_ema, - image_size=image_size, - scheduler_type=scheduler_type, - num_in_channels=num_in_channels, - upcast_attention=upcast_attention, - load_safety_checker=load_safety_checker, - prediction_type=prediction_type, - paddle_dtype=paddle_dtype, - ) - - return pipe - - -def http_file_name( - url: str, - *, - proxies=None, - headers: Optional[Dict[str, str]] = None, - timeout=10.0, - max_retries=0, -): - """ - Get a remote file name. - """ - headers = copy.deepcopy(headers) or {} - r = _request_wrapper( - method="GET", - url=url, - stream=True, - proxies=proxies, - headers=headers, - timeout=timeout, - max_retries=max_retries, - ) - hf_raise_for_status(r) - displayed_name = url.split("/")[-1] - content_disposition = r.headers.get("Content-Disposition") - if content_disposition is not None and "filename=" in content_disposition: - # Means file is on CDN - displayed_name = content_disposition.split("filename=")[-1] - return displayed_name diff --git a/ppdiffusers/ppdiffusers/models/__init__.py b/ppdiffusers/ppdiffusers/models/__init__.py deleted file mode 100644 index cfd5eb946355..000000000000 --- a/ppdiffusers/ppdiffusers/models/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
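Stepping back to `from_ckpt` a little earlier: when given a full Hub blob URL it is reduced to a `repo_id` plus an in-repo file path before calling `hf_hub_download`. A standalone sketch of that reduction, using the same example URL as the docstring:

```py
from pathlib import Path

# Sketch of the Hub blob-URL parsing in from_ckpt above (POSIX paths assumed).
url = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt"
for prefix in ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]:
    if url.startswith(prefix):
        url = url[len(prefix):]
ckpt_path = Path(url)
repo_id = str(Path().joinpath(*ckpt_path.parts[:2]))    # "runwayml/stable-diffusion-v1-5"
file_path = str(Path().joinpath(*ckpt_path.parts[2:]))  # "blob/main/v1-5-pruned-emaonly.ckpt"
for lead in ("blob/", "main/"):
    if file_path.startswith(lead):
        file_path = file_path[len(lead):]
assert (repo_id, file_path) == ("runwayml/stable-diffusion-v1-5", "v1-5-pruned-emaonly.ckpt")
```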
-# flake8: noqa - - -from ..utils.import_utils import ( - OptionalDependencyNotAvailable, - is_einops_available, - is_paddle_available, -) - -if is_paddle_available(): - from .adapter import MultiAdapter, T2IAdapter - from .autoencoder_kl import AutoencoderKL - from .controlnet import ControlNetModel - from .dual_transformer_2d import DualTransformer2DModel - from .ema import LitEma - from .modeling_utils import ModelMixin - from .prior_transformer import PriorTransformer - from .t5_film_transformer import T5FilmDecoder - from .transformer_2d import Transformer2DModel - from .unet_1d import UNet1DModel - from .unet_2d import UNet2DModel - from .unet_2d_condition import UNet2DConditionModel - from .unet_3d_condition import UNet3DConditionModel - from .vq_model import VQModel - -try: - if not (is_paddle_available() and is_einops_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ..utils.dummy_paddle_and_einops_objects import * # noqa F403 -else: - from .uvit import UViTModel diff --git a/ppdiffusers/ppdiffusers/models/adapter.py b/ppdiffusers/ppdiffusers/models/adapter.py deleted file mode 100644 index 2df877854dc3..000000000000 --- a/ppdiffusers/ppdiffusers/models/adapter.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
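The `models/__init__.py` above gates its `UViTModel` export on optional dependencies. A generic sketch of that pattern, with simplified stand-ins rather than the library's exact helpers:

```py
# Generic sketch of the optional-dependency gating used in models/__init__.py above.
class OptionalDependencyNotAvailable(Exception):
    pass

def is_einops_available():
    try:
        import einops  # noqa: F401
        return True
    except ImportError:
        return False

try:
    if not is_einops_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    UViTModel = None  # the real package pulls dummy placeholder objects instead
else:
    pass  # the real package does: from .uvit import UViTModel
```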
- -from typing import List, Optional - -import paddle - -from ..configuration_utils import ConfigMixin, register_to_config -from .modeling_utils import ModelMixin -from .resnet import Downsample2D - - -class BottleneckResnetBlock(paddle.nn.Layer): - def __init__(self, in_c, mid_c, out_c, down, ksize=3, sk=False, use_conv=True, proj_ksize=1): - super().__init__() - ps = ksize // 2 - proj_pad = proj_ksize // 2 - if in_c != mid_c or sk is False: - self.conv1 = paddle.nn.Conv2D( - in_channels=in_c, out_channels=mid_c, kernel_size=proj_ksize, stride=1, padding=proj_pad - ) - else: - self.conv1 = None - if out_c != mid_c: - self.conv2 = paddle.nn.Conv2D( - in_channels=mid_c, out_channels=out_c, kernel_size=proj_ksize, stride=1, padding=proj_pad - ) - else: - self.conv2 = None - self.block1 = paddle.nn.Conv2D(in_channels=mid_c, out_channels=mid_c, kernel_size=3, stride=1, padding=1) - self.act = paddle.nn.ReLU() - self.block2 = paddle.nn.Conv2D(in_channels=mid_c, out_channels=mid_c, kernel_size=ksize, stride=1, padding=ps) - if sk is False: - self.conv_shortcut = paddle.nn.Conv2D( - in_channels=in_c, out_channels=mid_c, kernel_size=ksize, stride=1, padding=ps - ) - else: - self.conv_shortcut = None - self.down = down - if self.down is True: - self.downsample = Downsample2D(in_c, use_conv=use_conv) - - def forward(self, x): - if self.down is True: - x = self.downsample(x) - if self.conv1 is not None: - x = self.conv1(x) - h = self.block1(x) - h = self.act(h) - h = self.block2(h) - if self.conv_shortcut is not None: - h = h + self.conv_shortcut(x) - else: - h = h + x - if self.conv2 is not None: - h = self.conv2(h) - return h - - -class T2IAdapter(ModelMixin, ConfigMixin): - """ - A simple ResNet-like model that accepts images containing control signals such as keyposes and depth. The model - generates multiple feature maps that are used as additional conditioning in [`UNet2DConditionModel`]. The model's - architecture follows the original implementation of - [Adapter](https://github.com/TencentARC/T2I-Adapter/blob/686de4681515662c0ac2ffa07bf5dda83af1038a/ldm/modules/encoders/adapter.py#L97) - and - [AdapterLight](https://github.com/TencentARC/T2I-Adapter/blob/686de4681515662c0ac2ffa07bf5dda83af1038a/ldm/modules/encoders/adapter.py#L235). - - This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the model (such as downloading or saving, etc.) - - Parameters: - block_out_channels (`List[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): - The number of channel of each downsample block's output hidden state. The `len(block_out_channels)` will - also determine the number of downsample blocks in the Adapter. - block_mid_channels (`List[int]`, *optional*, defaults to `block_out_channels` if not provided): - The number of channels ResNet blocks in each downsample blocks will have, a downsample block will insert a - projection layer in the last ResNet block when having different "mid_channel" and "out_channel". - num_res_blocks (`int`, *optional*, defaults to 3): - Number of ResNet blocks in each downsample block - channels_in (`int`, *optional*, defaults to 3): - Number of channels of Aapter's input(*control image*). Set this parameter to 1 if you're using gray scale - image as *control image*. - kernel_size (`int`, *optional*, defaults to 3): - Kernel size of conv-2d layers inside ResNet blocks. 
- proj_kernel_size (`int`, *optional*, defaults to 3): - Kernel size of conv-2d projection layers located at the start and end of a downsample block. - res_block_skip (`bool`, *optional*, defaults to True): - If set to `True`, ResNet block will using a regular residual connect that add layer's input to its output. - If set to `False`, ResNet block will create a additional conv-2d layer in residual connect before adding - residual back. - use_conv (`bool`, *optional*, defaults to False): - Whether to use a conv-2d layer for down sample feature map or a average pooling layer. - input_scale_factor (`int`, *optional*, defaults to 8): - The down scaling factor will be apply to input image when it is frist deliver to Adapter. Which should be - equal to the down scaling factor of the VAE of your choice. - """ - - @register_to_config - def __init__( - self, - block_out_channels: List[int] = [320, 640, 1280, 1280], - block_mid_channels: Optional[List[int]] = None, - num_res_blocks: int = 3, - channels_in: int = 3, - kernel_size: int = 3, - proj_kernel_size: int = 1, - res_block_skip: bool = True, - use_conv: bool = False, - input_scale_factor: int = 8, - ): - super(T2IAdapter, self).__init__() - self.num_downsample_blocks = len(block_out_channels) - self.unshuffle = paddle.nn.PixelUnshuffle(downscale_factor=input_scale_factor) - self.num_res_blocks = num_res_blocks - self.body = [] - if block_mid_channels is None: - block_mid_channels = block_out_channels - for i in range(self.num_downsample_blocks): - for j in range(num_res_blocks): - if i != 0 and j == 0: - self.body.append( - BottleneckResnetBlock( - block_out_channels[i - 1], - block_mid_channels[i], - block_mid_channels[i], - down=True, - ksize=kernel_size, - proj_ksize=proj_kernel_size, - sk=res_block_skip, - use_conv=use_conv, - ) - ) - elif j == num_res_blocks - 1: - self.body.append( - BottleneckResnetBlock( - block_mid_channels[i], - block_mid_channels[i], - block_out_channels[i], - down=False, - ksize=kernel_size, - proj_ksize=proj_kernel_size, - sk=res_block_skip, - use_conv=use_conv, - ) - ) - else: - self.body.append( - BottleneckResnetBlock( - block_mid_channels[i], - block_mid_channels[i], - block_mid_channels[i], - down=False, - ksize=kernel_size, - proj_ksize=proj_kernel_size, - sk=res_block_skip, - use_conv=use_conv, - ) - ) - self.body = paddle.nn.LayerList(sublayers=self.body) - if block_mid_channels[0] == block_out_channels[0]: - self.conv_in = paddle.nn.Conv2D( - in_channels=channels_in * input_scale_factor**2, - out_channels=block_mid_channels[0], - kernel_size=3, - stride=1, - padding=1, - ) - else: - self.conv_in = paddle.nn.Conv2D( - in_channels=channels_in * input_scale_factor**2, - out_channels=block_mid_channels[0], - kernel_size=proj_kernel_size, - stride=1, - padding=proj_kernel_size // 2, - ) - - def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]: - """ - Args: - x (`torch.Tensor`): - (batch, channel, height, width) input images for adapter model, `channel` should equal to - `channels_in`. - """ - x = self.unshuffle(x) - features = [] - x = self.conv_in(x) - for i in range(self.num_downsample_blocks): - for j in range(self.num_res_blocks): - idx = i * self.num_res_blocks + j - x = self.body[idx](x) - features.append(x) - return features - - -class MultiAdapter(ModelMixin): - """ - MultiAdapter is a wrapper model that contains multiple adapter models and merges their outputs according to - user-assigned weighting. - - This model inherits from [`ModelMixin`]. 
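Before `MultiAdapter`, a hedged usage sketch for the `T2IAdapter` defined above (exported as `ppdiffusers.models.T2IAdapter` per the `__init__.py` earlier); the shapes follow from the 8x `PixelUnshuffle` and the per-block downsampling with the default configuration:

```py
import paddle

# Hedged sketch: default T2IAdapter (block_out_channels=[320, 640, 1280, 1280])
# applied to a 3-channel 512x512 control image, e.g. a rendered depth or canny map.
adapter = T2IAdapter()
control = paddle.randn([1, 3, 512, 512])
features = adapter(control)  # one feature map per downsample block
# Expected shapes: [1, 320, 64, 64], [1, 640, 32, 32], [1, 1280, 16, 16], [1, 1280, 8, 8],
# consumed as additional conditioning by UNet2DConditionModel.
```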
Check the superclass documentation for the generic methods the library - implements for all the model (such as downloading or saving, etc.) - - Parameters: - adapters (`List[T2IAdapter]`, *optional*, defaults to None): - A list of `T2IAdapter` model instances. - """ - - def __init__(self, adapters: List[T2IAdapter]): - super(MultiAdapter, self).__init__() - self.num_adapter = len(adapters) - self.adapters = paddle.nn.LayerList(sublayers=adapters) - - def forward(self, xs: paddle.Tensor, adapter_weights: Optional[List[float]] = None) -> List[paddle.Tensor]: - """ - Args: - xs (`torch.Tensor`): - (batch, channel, height, width) input images for multiple adapter models concated along dimension 1, - `channel` should equal to `num_adapter` * "number of channel of image". - adapter_weights (`List[float]`, *optional*, defaults to None): - List of floats representing the weight which will be multiply to each adapter's output before adding - them together. - """ - if adapter_weights is None: - adapter_weights = paddle.to_tensor([1 / self.num_adapter] * self.num_adapter) - else: - adapter_weights = paddle.to_tensor(adapter_weights) - if xs.shape[1] % self.num_adapter != 0: - raise ValueError( - f"Expecting multi-adapter's input have number of channel that cab be evenly divisible by num_adapter: {xs.shape[1]} % {self.num_adapter} != 0" - ) - x_list = paddle.chunk(x=xs, chunks=self.num_adapter, axis=1) - accume_state = None - for x, w, adapter in zip(x_list, adapter_weights, self.adapters): - features = adapter(x) - if accume_state is None: - accume_state = features - else: - for i in range(len(features)): - accume_state[i] += w * features[i] - return accume_state diff --git a/ppdiffusers/ppdiffusers/models/attention.py b/ppdiffusers/ppdiffusers/models/attention.py deleted file mode 100644 index 162da05757a8..000000000000 --- a/ppdiffusers/ppdiffusers/models/attention.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math -from typing import Optional - -import paddle -import paddle.nn.functional as F -from paddle import nn - -from ..utils import is_ppxformers_available -from .attention_processor import Attention -from .embeddings import CombinedTimestepLabelEmbeddings - - -def drop_path(input, drop_prob: float = 0.0, training: bool = False): - """ - Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - - Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, - however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... - See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the - layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the - argument. 
- """ - if drop_prob == 0.0 or not training: - return input - keep_prob = 1 - drop_prob - shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets - random_tensor = keep_prob + paddle.rand(shape, dtype=input.dtype) - random_tensor = paddle.floor(random_tensor) # binarize - output = (input / keep_prob) * random_tensor - return output - - -class DropPath(nn.Layer): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" - - def __init__(self, drop_prob: Optional[float] = None) -> None: - super().__init__() - self.drop_prob = drop_prob - - def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: - return drop_path(hidden_states, self.drop_prob, self.training) - - def extra_repr(self) -> str: - return "p={}".format(self.drop_prob) - - -class Mlp(nn.Layer): - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -class AttentionBlock(nn.Layer): - """ - An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted - to the N-d case. - https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. - Uses three q, k, v linear layers to compute attention. - - Parameters: - channels (`int`): The number of channels in the input and output. - num_head_channels (`int`, *optional*): - The number of channels in each head. If None, then `num_heads` = 1. - norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for group norm. - rescale_output_factor (`float`, *optional*, defaults to 1.0): The factor to rescale the output by. - eps (`float`, *optional*, defaults to 1e-5): The epsilon value to use for group norm. - """ - - # IMPORTANT;TODO(Patrick, William) - this class will be deprecated soon. 
Do not use it anymore - - def __init__( - self, - channels: int, - num_head_channels: Optional[int] = None, - norm_num_groups: int = 32, - rescale_output_factor: float = 1.0, - eps: float = 1e-5, - ): - super().__init__() - self.channels = channels - - self.num_heads = channels // num_head_channels if num_head_channels is not None else 1 - self.head_size = self.channels // self.num_heads - self.scale = 1 / math.sqrt(self.channels / self.num_heads) - - self.group_norm = nn.GroupNorm(num_channels=channels, num_groups=norm_num_groups, epsilon=eps) - - # define q,k,v as linear layers - self.query = nn.Linear(channels, channels) - self.key = nn.Linear(channels, channels) - self.value = nn.Linear(channels, channels) - - self.rescale_output_factor = rescale_output_factor - self.proj_attn = nn.Linear(channels, channels) - - self._use_memory_efficient_attention_xformers = False - self._use_2_5_attn = True - self._attention_op = None - - def reshape_heads_to_batch_dim(self, tensor, transpose=True, merge_head_and_batch=False): - tensor = tensor.reshape([0, 0, self.num_heads, self.head_size]) - # currently we donot use `unmerge_head_and_batch` - if transpose or merge_head_and_batch: - tensor = tensor.transpose([0, 2, 1, 3]) - - if merge_head_and_batch: - tensor = tensor.flatten(0, 1) - return tensor - - def reshape_batch_dim_to_heads(self, tensor, transpose=True, unmerge_head_and_batch=False): - # currently we donot use `unmerge_head_and_batch` - if unmerge_head_and_batch: - seq_len = tensor.shape[1] - tensor = tensor.reshape([-1, self.num_heads, seq_len, self.head_size]) - - if transpose or unmerge_head_and_batch: - tensor = tensor.transpose([0, 2, 1, 3]) - - tensor = tensor.reshape([0, 0, tensor.shape[2] * tensor.shape[3]]) - return tensor - - def set_use_memory_efficient_attention_xformers( - self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[str] = None - ): - if self.head_size > 128 and attention_op == "flash": - attention_op = "cutlass" - if use_memory_efficient_attention_xformers: - if not is_ppxformers_available(): - raise NotImplementedError( - "requires the scaled_dot_product_attention but your PaddlePaddle donot have this. Checkout the instructions on the installation page: https://www.paddlepaddle.org.cn/install/quick and follow the ones that match your environment." 
- ) - else: - try: - _ = F.scaled_dot_product_attention_( - paddle.randn((1, 1, 2, 40), dtype=paddle.float16), - paddle.randn((1, 1, 2, 40), dtype=paddle.float16), - paddle.randn((1, 1, 2, 40), dtype=paddle.float16), - attention_op=attention_op, - ) - except Exception as e: - raise e - - self._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers - self._attention_op = attention_op - - def forward(self, hidden_states): - residual = hidden_states - batch, channel, height, width = hidden_states.shape - - # norm - hidden_states = self.group_norm(hidden_states) - - hidden_states = hidden_states.reshape([batch, channel, height * width]).transpose([0, 2, 1]) - - # proj to q, k, v - query_proj = self.query(hidden_states) - key_proj = self.key(hidden_states) - value_proj = self.value(hidden_states) - - query_proj = self.reshape_heads_to_batch_dim( - query_proj, transpose=not self._use_memory_efficient_attention_xformers - ) - key_proj = self.reshape_heads_to_batch_dim( - key_proj, transpose=not self._use_memory_efficient_attention_xformers - ) - value_proj = self.reshape_heads_to_batch_dim( - value_proj, transpose=not self._use_memory_efficient_attention_xformers - ) - - if self._use_memory_efficient_attention_xformers: - hidden_states = F.scaled_dot_product_attention_( - query_proj, - key_proj, - value_proj, - attn_mask=None, - scale=self.scale, - dropout_p=0.0, - training=self.training, - attention_op=self._attention_op, - ) - else: - attention_scores = paddle.matmul(query_proj, key_proj, transpose_y=True) * self.scale - attention_probs = F.softmax(attention_scores.cast("float32"), axis=-1).cast(attention_scores.dtype) - hidden_states = paddle.matmul(attention_probs, value_proj) - - # reshape hidden_states - hidden_states = self.reshape_batch_dim_to_heads( - hidden_states, transpose=not self._use_memory_efficient_attention_xformers - ) - - # compute next hidden_states - hidden_states = self.proj_attn(hidden_states) - - hidden_states = hidden_states.transpose([0, 2, 1]).reshape([batch, channel, height, width]) - - # res connect and rescale - hidden_states = (hidden_states + residual) / self.rescale_output_factor - return hidden_states - - -class BasicTransformerBlock(nn.Layer): - r""" - A basic Transformer block. - - Parameters: - dim (`int`): The number of channels in the input and output. - num_attention_heads (`int`): The number of heads to use for multi-head attention. - attention_head_dim (`int`): The number of channels in each head. - dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. - cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention. - only_cross_attention (`bool`, *optional*): - Whether to use only cross-attention layers. In this case two cross attention layers are used. - double_self_attention (`bool`, *optional*): - Whether to use two self-attention layers. In this case no cross attention layers are used. - activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. - num_embeds_ada_norm (: - obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`. - attention_bias (: - obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter. 
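Stepping back to `AttentionBlock.forward` above: its fallback (non-xformers) branch is a plain scaled dot-product attention over `[batch, heads, seq, head_dim]` tensors, with the softmax computed in float32 for stability. A toy-shaped sketch of just that math:

```py
import math

import paddle
import paddle.nn.functional as F

# Toy shapes, illustrating the fallback attention path in AttentionBlock.forward above.
batch, heads, seq, head_dim = 1, 2, 4, 8
q = paddle.randn([batch, heads, seq, head_dim])
k = paddle.randn([batch, heads, seq, head_dim])
v = paddle.randn([batch, heads, seq, head_dim])

scale = 1 / math.sqrt(head_dim)
scores = paddle.matmul(q, k, transpose_y=True) * scale                  # [1, 2, 4, 4]
probs = F.softmax(scores.cast("float32"), axis=-1).cast(scores.dtype)   # float32 softmax
out = paddle.matmul(probs, v)                                           # [1, 2, 4, 8]
```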
- """ - - def __init__( - self, - dim: int, - num_attention_heads: int, - attention_head_dim: int, - dropout=0.0, - cross_attention_dim: Optional[int] = None, - activation_fn: str = "geglu", - num_embeds_ada_norm: Optional[int] = None, - attention_bias: bool = False, - only_cross_attention: bool = False, - double_self_attention: bool = False, - upcast_attention: bool = False, - norm_elementwise_affine: bool = True, - norm_type: str = "layer_norm", - final_dropout: bool = False, - ): - super().__init__() - self.only_cross_attention = only_cross_attention - - self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero" - self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm" - - if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None: - raise ValueError( - f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to" - f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}." - ) - - if not norm_elementwise_affine: - norm_kwargs = {"weight_attr": False, "bias_attr": False} - else: - norm_kwargs = {} - - # Define 3 blocks. Each block has its own normalization layer. - # 1. Self-Attn - if self.use_ada_layer_norm: - self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) - elif self.use_ada_layer_norm_zero: - self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm) - else: - self.norm1 = nn.LayerNorm(dim, **norm_kwargs) - self.attn1 = Attention( - query_dim=dim, - heads=num_attention_heads, - dim_head=attention_head_dim, - dropout=dropout, - bias=attention_bias, - cross_attention_dim=cross_attention_dim if only_cross_attention else None, - upcast_attention=upcast_attention, - ) - - # 2. Cross-Attn - if cross_attention_dim is not None or double_self_attention: - # We currently only use AdaLayerNormZero for self attention where there will only be one attention block. - # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during - # the second cross attention block. - self.norm2 = ( - AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim, **norm_kwargs) - ) - self.attn2 = Attention( - query_dim=dim, - cross_attention_dim=cross_attention_dim if not double_self_attention else None, - heads=num_attention_heads, - dim_head=attention_head_dim, - dropout=dropout, - bias=attention_bias, - upcast_attention=upcast_attention, - ) # is self-attn if encoder_hidden_states is none - else: - self.norm2 = None - self.attn2 = None - - # 3. Feed-forward - self.norm3 = nn.LayerNorm(dim, **norm_kwargs) - self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout) - - def forward( - self, - hidden_states, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - timestep=None, - cross_attention_kwargs=None, - class_labels=None, - ): - # Notice that normalization is always applied before the real computation in the following blocks. - # 1. 
Self-Attention - if self.use_ada_layer_norm: - norm_hidden_states = self.norm1(hidden_states, timestep) - elif self.use_ada_layer_norm_zero: - norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( - hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype - ) - else: - norm_hidden_states = self.norm1(hidden_states) - - cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} - attn_output = self.attn1( - norm_hidden_states, - encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, - attention_mask=attention_mask, - **cross_attention_kwargs, - ) - if self.use_ada_layer_norm_zero: - attn_output = gate_msa.unsqueeze(1) * attn_output - hidden_states = attn_output + hidden_states - - if self.attn2 is not None: - norm_hidden_states = ( - self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) - ) - # TODO (Birch-San): Here we should prepare the encoder_attention mask correctly - # prepare attention mask here - - # 2. Cross-Attention - attn_output = self.attn2( - norm_hidden_states, - encoder_hidden_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - **cross_attention_kwargs, - ) - hidden_states = attn_output + hidden_states - - # 3. Feed-forward - norm_hidden_states = self.norm3(hidden_states) - - if self.use_ada_layer_norm_zero: - norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] - - ff_output = self.ff(norm_hidden_states) - - if self.use_ada_layer_norm_zero: - ff_output = gate_mlp.unsqueeze(1) * ff_output - - hidden_states = ff_output + hidden_states - - return hidden_states - - -class FeedForward(nn.Layer): - r""" - A feed-forward layer. - - Parameters: - dim (`int`): The number of channels in the input. - dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`. - mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension. - dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. - activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. - final_dropout (`bool` *optional*, defaults to False): Apply a final dropout. - """ - - def __init__( - self, - dim: int, - dim_out: Optional[int] = None, - mult: int = 4, - dropout: float = 0.0, - activation_fn: str = "geglu", - final_dropout: bool = False, - ): - super().__init__() - inner_dim = int(dim * mult) - dim_out = dim_out if dim_out is not None else dim - - if activation_fn == "gelu": - act_fn = GELU(dim, inner_dim) - if activation_fn == "gelu-approximate": - act_fn = GELU(dim, inner_dim, approximate="tanh") - elif activation_fn == "geglu": - act_fn = GEGLU(dim, inner_dim) - elif activation_fn == "geglu-approximate": - act_fn = ApproximateGELU(dim, inner_dim) - - self.net = nn.LayerList([]) - # project in - self.net.append(act_fn) - # project dropout - self.net.append(nn.Dropout(dropout)) - # project out - self.net.append(nn.Linear(inner_dim, dim_out)) - # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout - if final_dropout: - self.net.append(nn.Dropout(dropout)) - - def forward(self, hidden_states): - for module in self.net: - hidden_states = module(hidden_states) - return hidden_states - - -class GELU(nn.Layer): - r""" - GELU activation function with tanh approximation support with `approximate="tanh"`. 
- """ - - def __init__(self, dim_in: int, dim_out: int, approximate: str = "none"): - super().__init__() - self.proj = nn.Linear(dim_in, dim_out) - self.approximate = approximate - self.approximate_bool = approximate == "tanh" - - def forward(self, hidden_states): - hidden_states = self.proj(hidden_states) - hidden_states = F.gelu(hidden_states, approximate=self.approximate_bool) - return hidden_states - - -class GEGLU(nn.Layer): - r""" - A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202. - - Parameters: - dim_in (`int`): The number of channels in the input. - dim_out (`int`): The number of channels in the output. - """ - - def __init__(self, dim_in: int, dim_out: int): - super().__init__() - self.proj = nn.Linear(dim_in, dim_out * 2) - - def forward(self, hidden_states): - hidden_states, gate = self.proj(hidden_states).chunk(2, axis=-1) - return hidden_states * F.gelu(gate) - - -class ApproximateGELU(nn.Layer): - """ - The approximate form of Gaussian Error Linear Unit (GELU) - - For more details, see section 2: https://arxiv.org/abs/1606.08415 - """ - - def __init__(self, dim_in: int, dim_out: int): - super().__init__() - self.proj = nn.Linear(dim_in, dim_out) - - def forward(self, x): - x = self.proj(x) - return x * F.sigmoid(1.702 * x) - - -class AdaLayerNorm(nn.Layer): - """ - Norm layer modified to incorporate timestep embeddings. - """ - - def __init__(self, embedding_dim, num_embeddings): - super().__init__() - self.emb = nn.Embedding(num_embeddings, embedding_dim) - self.silu = nn.Silu() - self.linear = nn.Linear(embedding_dim, embedding_dim * 2) - # elementwise_affine=False - norm_kwargs = {"weight_attr": False, "bias_attr": False} - self.norm = nn.LayerNorm(embedding_dim, **norm_kwargs) - - def forward(self, x, timestep): - emb = self.linear(self.silu(self.emb(timestep))) - # must set axis=-1, paddle vs pytorch - scale, shift = paddle.chunk(emb, 2, axis=-1) - x = self.norm(x) * (1 + scale) + shift - return x - - -class AdaLayerNormZero(nn.Layer): - """ - Norm layer adaptive layer norm zero (adaLN-Zero). - """ - - def __init__(self, embedding_dim, num_embeddings): - super().__init__() - - self.emb = CombinedTimestepLabelEmbeddings(num_embeddings, embedding_dim) - - self.silu = nn.Silu() - self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias_attr=True) - # elementwise_affine=False - norm_kwargs = {"weight_attr": False, "bias_attr": False} - self.norm = nn.LayerNorm(embedding_dim, epsilon=1e-6, **norm_kwargs) - - def forward(self, x, timestep, class_labels, hidden_dtype=None): - emb = self.linear(self.silu(self.emb(timestep, class_labels, hidden_dtype=hidden_dtype))) - shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, axis=1) - x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None] - return x, gate_msa, shift_mlp, scale_mlp, gate_mlp - - -class AdaGroupNorm(nn.Layer): - """ - GroupNorm layer modified to incorporate timestep embeddings. 
- """ - - def __init__( - self, embedding_dim: int, out_dim: int, num_groups: int, act_fn: Optional[str] = None, eps: float = 1e-5 - ): - super().__init__() - self.num_groups = num_groups - self.eps = eps - self.act = None - if act_fn == "swish": - self.act = lambda x: F.silu(x) - elif act_fn == "mish": - self.act = nn.Mish() - elif act_fn == "silu": - self.act = nn.Silu() - elif act_fn == "gelu": - self.act = nn.GELU() - - self.linear = nn.Linear(embedding_dim, out_dim * 2) - # elementwise_affine=False - norm_kwargs = {"weight_attr": False, "bias_attr": False} - self.group_norm = nn.GroupNorm(num_groups, out_dim, epsilon=eps, **norm_kwargs) - self.group_norm.weight = None - self.group_norm.bias = None - - def forward(self, x, emb): - if self.act: - emb = self.act(emb) - emb = self.linear(emb) - emb = emb[:, :, None, None] - scale, shift = emb.chunk(2, axis=1) - x = self.group_norm(x) - x = x * (1 + scale) + shift - return x diff --git a/ppdiffusers/ppdiffusers/models/attention_processor.py b/ppdiffusers/ppdiffusers/models/attention_processor.py deleted file mode 100644 index d31682aa1f03..000000000000 --- a/ppdiffusers/ppdiffusers/models/attention_processor.py +++ /dev/null @@ -1,1078 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import Optional, Union - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ..initializer import normal_, zeros_ -from ..utils import deprecate, is_ppxformers_available, logging - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class Attention(nn.Layer): - r""" - A cross attention layer. - - Parameters: - query_dim (`int`): The number of channels in the query. - cross_attention_dim (`int`, *optional*): - The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`. - heads (`int`, *optional*, defaults to 8): The number of heads to use for multi-head attention. - dim_head (`int`, *optional*, defaults to 64): The number of channels in each head. - dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. - bias (`bool`, *optional*, defaults to False): - Set to `True` for the query, key, and value linear layers to contain a bias parameter. 
- """ - - def __init__( - self, - query_dim: int, - cross_attention_dim: Optional[int] = None, - heads: int = 8, - dim_head: int = 64, - dropout: float = 0.0, - bias=False, - upcast_attention: bool = False, - upcast_softmax: bool = False, - cross_attention_norm: Optional[str] = None, - cross_attention_norm_num_groups: int = 32, - added_kv_proj_dim: Optional[int] = None, - norm_num_groups: Optional[int] = None, - out_bias: bool = True, - scale_qk: bool = True, - only_cross_attention: bool = False, - processor: Optional["AttnProcessor"] = None, - ): - super().__init__() - inner_dim = dim_head * heads - cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim - self.upcast_attention = upcast_attention - self.upcast_softmax = upcast_softmax - - self.scale = dim_head**-0.5 if scale_qk else 1.0 - - self.heads = heads - self.head_dim = dim_head - # for slice_size > 0 the attention score computation - # is split across the batch axis to save memory - # You can set slice_size with `set_attention_slice` - self.sliceable_head_dim = heads - - self.added_kv_proj_dim = added_kv_proj_dim - self.only_cross_attention = only_cross_attention - - if self.added_kv_proj_dim is None and self.only_cross_attention: - raise ValueError( - "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`." - ) - - if norm_num_groups is not None: - self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, epsilon=1e-5) - else: - self.group_norm = None - - if cross_attention_norm is None: - self.norm_cross = None - elif cross_attention_norm == "layer_norm": - self.norm_cross = nn.LayerNorm(cross_attention_dim) - elif cross_attention_norm == "group_norm": - if self.added_kv_proj_dim is not None: - # The given `encoder_hidden_states` are initially of shape - # (batch_size, seq_len, added_kv_proj_dim) before being projected - # to (batch_size, seq_len, cross_attention_dim). The norm is applied - # before the projection, so we need to use `added_kv_proj_dim` as - # the number of channels for the group norm. - norm_cross_num_channels = added_kv_proj_dim - else: - norm_cross_num_channels = cross_attention_dim - - self.norm_cross = nn.GroupNorm( - num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, epsilon=1e-5 - ) - else: - raise ValueError( - f"unknown cross_attention_norm: {cross_attention_norm}. 
Should be None, 'layer_norm' or 'group_norm'" - ) - - self.to_q = nn.Linear(query_dim, inner_dim, bias_attr=bias) - - if not self.only_cross_attention: - # only relevant for the `AddedKVProcessor` classes - self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias_attr=bias) - self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias_attr=bias) - else: - self.to_k = None - self.to_v = None - - if self.added_kv_proj_dim is not None: - self.add_k_proj = nn.Linear(added_kv_proj_dim, inner_dim) - self.add_v_proj = nn.Linear(added_kv_proj_dim, inner_dim) - - self.to_out = nn.LayerList([]) - self.to_out.append(nn.Linear(inner_dim, query_dim, bias_attr=out_bias)) - self.to_out.append(nn.Dropout(dropout)) - - # set attention processor - if processor is None: - processor = AttnProcessor() - # processor = AttnProcessor2_5() if is_ppxformers_available() else AttnProcessor() - self.set_processor(processor) - - def set_use_memory_efficient_attention_xformers( - self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[str] = None - ): - is_lora = hasattr(self, "processor") and isinstance( - self.processor, (LoRAAttnProcessor, LoRAXFormersAttnProcessor) - ) - is_custom_diffusion = hasattr(self, "processor") and isinstance( - self.processor, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor) - ) - is_added_kv = self.added_kv_proj_dim is not None - if use_memory_efficient_attention_xformers: - # if self.added_kv_proj_dim is not None: - # # TODO(Anton, Patrick, Suraj, William) - currently xformers doesn't work for UnCLIP - # # which uses this type of cross attention ONLY because the attention mask of format - # # [0, ..., -10.000, ..., 0, ...,] is not supported - # raise NotImplementedError( - # "Memory efficient attention with `xformers` is currently not supported when" - # " `self.added_kv_proj_dim` is defined." - # ) - if not is_ppxformers_available(): - raise NotImplementedError( - "requires the scaled_dot_product_attention but your PaddlePaddle donot have this. Checkout the instructions on the installation page: https://www.paddlepaddle.org.cn/install/quick and follow the ones that match your environment." 
- ) - else: - try: - # Make sure we can run the memory efficient attention - _ = F.scaled_dot_product_attention_( - paddle.ones((1, 1, 2, 40), dtype=paddle.float16), - paddle.ones((1, 1, 2, 40), dtype=paddle.float16), - paddle.ones((1, 1, 2, 40), dtype=paddle.float16), - attention_op=attention_op, - ) - except Exception as e: - raise e - if self.head_dim > 128 and attention_op == "flash": - attention_op = "cutlass" - if is_lora: - processor = LoRAXFormersAttnProcessor( - hidden_size=self.processor.hidden_size, - cross_attention_dim=self.processor.cross_attention_dim, - rank=self.processor.rank, - attention_op=attention_op, - ) - # we must cast dtype - processor.to(dtype=self.dtype) - processor.load_dict(self.processor.state_dict()) - elif is_custom_diffusion: - processor = CustomDiffusionXFormersAttnProcessor( - train_kv=self.processor.train_kv, - train_q_out=self.processor.train_q_out, - hidden_size=self.processor.hidden_size, - cross_attention_dim=self.processor.cross_attention_dim, - attention_op=attention_op, - ) - # we must cast dtype - processor.to(dtype=self.dtype) - processor.load_dict(self.processor.state_dict()) - elif is_added_kv: - processor = XFormersAttnAddedKVProcessor(attention_op=attention_op) - else: - processor = XFormersAttnProcessor(attention_op=attention_op) - else: - if is_lora: - processor = LoRAAttnProcessor( - hidden_size=self.processor.hidden_size, - cross_attention_dim=self.processor.cross_attention_dim, - rank=self.processor.rank, - ) - # we must cast dtype - processor.to(dtype=self.dtype) - processor.load_dict(self.processor.state_dict()) - elif is_custom_diffusion: - processor = CustomDiffusionAttnProcessor( - train_kv=self.processor.train_kv, - train_q_out=self.processor.train_q_out, - hidden_size=self.processor.hidden_size, - cross_attention_dim=self.processor.cross_attention_dim, - ) - # we must cast dtype - processor.to(dtype=self.dtype) - processor.load_dict(self.processor.state_dict()) - elif is_added_kv: - processor = AttnAddedKVProcessor(attention_op=attention_op) - else: - processor = AttnProcessor() - - self.set_processor(processor) - - def set_attention_slice(self, slice_size): - if slice_size is not None and slice_size > self.sliceable_head_dim: - raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.") - - if slice_size is not None and self.added_kv_proj_dim is not None: - processor = SlicedAttnAddedKVProcessor(slice_size) - elif slice_size is not None: - processor = SlicedAttnProcessor(slice_size) - elif self.added_kv_proj_dim is not None: - processor = AttnAddedKVProcessor() - else: - processor = AttnProcessor() - - self.set_processor(processor) - - def set_processor(self, processor: "AttnProcessor"): - # if current processor is in `self._sub_layers` and if passed `processor` is not, we need to - # pop `processor` from `self._sub_layers` - if hasattr(self, "processor") and isinstance(self.processor, nn.Layer) and not isinstance(processor, nn.Layer): - logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}") - self._sub_layers.pop("processor") - - self.processor = processor - - def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs): - # The `Attention` class can call different attention processors / attention functions - # here we simply pass along all tensors to the selected processor class - # For standard processors that are defined here, `**cross_attention_kwargs` is empty - return self.processor( - self, - 
hidden_states, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - **cross_attention_kwargs, - ) - - def batch_to_head_dim(self, tensor, transpose=True, in_dim=4): - if in_dim == 3: - head_size = self.heads - batch_size, seq_len, dim = tensor.shape - tensor = tensor.reshape([batch_size // head_size, head_size, seq_len, dim]) - if transpose: - tensor = tensor.transpose([0, 2, 1, 3]) - tensor = tensor.reshape([0, 0, tensor.shape[2] * tensor.shape[3]]) - return tensor - - def head_to_batch_dim(self, tensor, transpose=True, out_dim=4): - tensor = tensor.reshape([0, 0, self.heads, self.head_dim]) - if transpose or out_dim == 3: - tensor = tensor.transpose([0, 2, 1, 3]) - if out_dim == 3: - tensor = tensor.flatten(0, 1) - return tensor - - def get_attention_scores(self, query, key, attention_mask=None): - if self.upcast_softmax or self.upcast_attention: - dtype = query.dtype - - if self.upcast_attention: - query = query.cast(paddle.float32) - key = key.cast(paddle.float32) - - attention_scores = paddle.matmul(query, key, transpose_y=True) * self.scale - - if attention_mask is not None: - attention_scores = attention_scores + attention_mask - - if self.upcast_softmax: - attention_scores = attention_scores.cast(paddle.float32) - - attention_probs = F.softmax(attention_scores, axis=-1) - - if self.upcast_softmax or self.upcast_attention: - attention_probs = attention_probs.cast(dtype) - - return attention_probs - - def prepare_attention_mask(self, attention_mask, target_length, batch_size=None, out_dim=4, transpose=True): - if batch_size is None: - deprecate( - "batch_size=None", - "0.0.15", - message=( - "Not passing the `batch_size` parameter to `prepare_attention_mask` can lead to incorrect" - " attention mask preparation and is deprecated behavior. Please make sure to pass `batch_size` to" - " `prepare_attention_mask` when preparing the attention_mask." - ), - ) - batch_size = 1 - - num_heads = self.heads - if attention_mask is None: - return attention_mask - - if attention_mask.shape[-1] != target_length: - attention_mask = F.pad(attention_mask, (0, target_length), value=0.0, data_format="NCL") - if out_dim == 3: - if attention_mask.shape[0] < batch_size * num_heads: - attention_mask = attention_mask.repeat_interleave(num_heads, axis=0) - elif out_dim == 4: - attention_mask = attention_mask.unsqueeze(1) - if attention_mask.shape[0] < batch_size * num_heads: - attention_mask = attention_mask.repeat_interleave(num_heads, axis=1) - attention_mask = paddle.reshape(attention_mask, [batch_size, num_heads, -1, attention_mask.shape[-1]]) - - if attention_mask.ndim == 4: - if not transpose: - attention_mask = attention_mask.transpose([0, 2, 1, 3]) - return attention_mask - - def norm_encoder_hidden_states(self, encoder_hidden_states): - assert self.norm_cross is not None, "self.norm_cross must be defined to call self.norm_encoder_hidden_states" - - if isinstance(self.norm_cross, nn.LayerNorm): - encoder_hidden_states = self.norm_cross(encoder_hidden_states) - elif isinstance(self.norm_cross, nn.GroupNorm): - # Group norm norms along the channels dimension and expects - # input to be in the shape of (N, C, *). 
In this case, we want - # to norm along the hidden dimension, so we need to move - # (batch_size, sequence_length, hidden_size) -> - # (batch_size, hidden_size, sequence_length) - encoder_hidden_states = encoder_hidden_states.transpose([0, 2, 1]) - encoder_hidden_states = self.norm_cross(encoder_hidden_states) - encoder_hidden_states = encoder_hidden_states.transpose([0, 2, 1]) - else: - assert False - - return encoder_hidden_states - - -class AttnProcessor: - def __call__( - self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs - ): - batch_size, sequence_length, _ = ( - hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape - ) - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - query = attn.to_q(hidden_states) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - query = attn.head_to_batch_dim(query) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - - attention_probs = attn.get_attention_scores(query, key, attention_mask) - hidden_states = paddle.matmul(attention_probs, value) - hidden_states = attn.batch_to_head_dim(hidden_states) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - return hidden_states - - -class LoRALinearLayer(nn.Layer): - def __init__(self, in_features, out_features, rank=4, network_alpha=None): - super().__init__() - - if rank > min(in_features, out_features): - raise ValueError(f"LoRA rank {rank} must be less or equal than {min(in_features, out_features)}") - - self.down = nn.Linear(in_features, rank, bias_attr=False) - self.up = nn.Linear(rank, out_features, bias_attr=False) - # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script. - # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning - self.network_alpha = network_alpha - self.rank = rank - - normal_(self.down.weight, std=1 / rank) - zeros_(self.up.weight) - - def forward(self, hidden_states): - orig_dtype = hidden_states.dtype - dtype = self.down.weight.dtype - - down_hidden_states = self.down(hidden_states.cast(dtype)) - up_hidden_states = self.up(down_hidden_states) - - if self.network_alpha is not None: - up_hidden_states *= self.network_alpha / self.rank - - return up_hidden_states.cast(orig_dtype) - - -class LoRAAttnProcessor(nn.Layer): - r""" - Processor for implementing the LoRA attention mechanism. - - Args: - hidden_size (`int`, *optional*): - The hidden size of the attention layer. - cross_attention_dim (`int`, *optional*): - The number of channels in the `encoder_hidden_states`. - rank (`int`, defaults to 4): - The dimension of the LoRA update matrices. - network_alpha (`int`, *optional*): - Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. 
- """ - - def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None): - super().__init__() - - self.hidden_size = hidden_size - self.cross_attention_dim = cross_attention_dim - self.rank = rank - - self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) - self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) - self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) - self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) - - def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - scale=1.0, - **cross_attention_kwargs - ): - batch_size, sequence_length, _ = ( - hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape - ) - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - - query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states) - query = attn.head_to_batch_dim(query) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states) - - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - - attention_probs = attn.get_attention_scores(query, key, attention_mask) - hidden_states = paddle.matmul(attention_probs, value) - hidden_states = attn.batch_to_head_dim(hidden_states) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - return hidden_states - - -class CustomDiffusionAttnProcessor(nn.Layer): - def __init__( - self, - train_kv=True, - train_q_out=True, - hidden_size=None, - cross_attention_dim=None, - out_bias=True, - dropout=0.0, - ): - super().__init__() - self.train_kv = train_kv - self.train_q_out = train_q_out - - self.hidden_size = hidden_size - self.cross_attention_dim = cross_attention_dim - - # `_custom_diffusion` id for easy serialization and loading. 
- if self.train_kv: - self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias_attr=False) - self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias_attr=False) - if self.train_q_out: - self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias_attr=False) - self.to_out_custom_diffusion = nn.LayerList([]) - self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias_attr=out_bias)) - self.to_out_custom_diffusion.append(nn.Dropout(dropout)) - - def __call__( - self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs - ): - batch_size, sequence_length, _ = ( - hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape - ) - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - if self.train_q_out: - query = self.to_q_custom_diffusion(hidden_states) - else: - query = attn.to_q(hidden_states) - - if encoder_hidden_states is None: - crossattn = False - encoder_hidden_states = hidden_states - else: - crossattn = True - if attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - if self.train_kv: - key = self.to_k_custom_diffusion(encoder_hidden_states) - value = self.to_v_custom_diffusion(encoder_hidden_states) - else: - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - if crossattn: - detach = paddle.ones_like(key) - detach[:, :1, :] = detach[:, :1, :] * 0.0 - key = detach * key + (1 - detach) * key.detach() - value = detach * value + (1 - detach) * value.detach() - - query = attn.head_to_batch_dim(query) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - - attention_probs = attn.get_attention_scores(query, key, attention_mask) - hidden_states = paddle.matmul(attention_probs, value) - hidden_states = attn.batch_to_head_dim(hidden_states) - - if self.train_q_out: - # linear proj - hidden_states = self.to_out_custom_diffusion[0](hidden_states) - # dropout - hidden_states = self.to_out_custom_diffusion[1](hidden_states) - else: - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - return hidden_states - - -class AttnAddedKVProcessor: - def __call__( - self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs - ): - residual = hidden_states - hidden_states = hidden_states.reshape([hidden_states.shape[0], hidden_states.shape[1], -1]).transpose( - [0, 2, 1] - ) - batch_size, sequence_length, _ = hidden_states.shape - - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1]) - - query = attn.to_q(hidden_states) - query = attn.head_to_batch_dim(query) - - encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) - encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) - encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj) - encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj) - - if not attn.only_cross_attention: - key = attn.to_k(hidden_states) 
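 # the value projection below mirrors this key projection: both are computed from the
 # spatial hidden states and, after `head_to_batch_dim`, are concatenated with the
 # projected `encoder_hidden_states` along the sequence axis (axis=2) a few lines down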
- value = attn.to_v(hidden_states) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - key = paddle.concat([encoder_hidden_states_key_proj, key], axis=2) - value = paddle.concat([encoder_hidden_states_value_proj, value], axis=2) - else: - key = encoder_hidden_states_key_proj - value = encoder_hidden_states_value_proj - - attention_probs = attn.get_attention_scores(query, key, attention_mask) - hidden_states = paddle.matmul(attention_probs, value) - hidden_states = attn.batch_to_head_dim(hidden_states) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - hidden_states = hidden_states.transpose([0, 2, 1]).reshape(residual.shape) - hidden_states = hidden_states + residual - - return hidden_states - - -class XFormersAttnAddedKVProcessor: - def __init__(self, attention_op: Optional[str] = None): - assert attention_op in [None, "cutlass", "flash"] - self.attention_op = attention_op - - def __call__( - self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs - ): - residual = hidden_states - hidden_states = hidden_states.reshape([hidden_states.shape[0], hidden_states.shape[1], -1]).transpose( - [0, 2, 1] - ) - batch_size, sequence_length, _ = hidden_states.shape - - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1]) - - query = attn.to_q(hidden_states) - query = attn.head_to_batch_dim(query, transpose=False) - - encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) - encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) - encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj, transpose=False) - encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj, transpose=False) - - if not attn.only_cross_attention: - key = attn.to_k(hidden_states) - value = attn.to_v(hidden_states) - key = attn.head_to_batch_dim(key, transpose=False) - value = attn.head_to_batch_dim(value, transpose=False) - key = paddle.concat([encoder_hidden_states_key_proj, key], axis=1) - value = paddle.concat([encoder_hidden_states_value_proj, value], axis=1) - else: - key = encoder_hidden_states_key_proj - value = encoder_hidden_states_value_proj - - hidden_states = F.scaled_dot_product_attention_( - query, - key, - value, - attn_mask=attention_mask, - scale=attn.scale, - dropout_p=0.0, - training=attn.training, - attention_op=self.attention_op, - ) - hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - hidden_states = hidden_states.transpose([0, 2, 1]).reshape(residual.shape) - hidden_states = hidden_states + residual - - return hidden_states - - -class XFormersAttnProcessor: - def __init__(self, attention_op: Optional[str] = None): - assert attention_op in [None, "cutlass", "flash"] - self.attention_op = attention_op - - def __call__( - self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs - ): - batch_size, sequence_length, _ = ( - 
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape - ) - - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False) - - query = attn.to_q(hidden_states) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - # if transpose = False, query's shape will be [batch_size, seq_len, num_head, head_dim] - query = attn.head_to_batch_dim(query, transpose=False) - key = attn.head_to_batch_dim(key, transpose=False) - value = attn.head_to_batch_dim(value, transpose=False) - - hidden_states = F.scaled_dot_product_attention_( - query, - key, - value, - attn_mask=attention_mask, - scale=attn.scale, - dropout_p=0.0, - training=attn.training, - attention_op=self.attention_op, - ) - - # hidden_states = hidden_states.cast(query.dtype) - hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - return hidden_states - - -class LoRAXFormersAttnProcessor(nn.Layer): - r""" - Processor for implementing the LoRA attention mechanism with memory efficient attention using xFormers. - - Args: - hidden_size (`int`, *optional*): - The hidden size of the attention layer. - cross_attention_dim (`int`, *optional*): - The number of channels in the `encoder_hidden_states`. - rank (`int`, defaults to 4): - The dimension of the LoRA update matrices. - attention_op (`Callable`, *optional*, defaults to `None`): - The base - [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to - use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best - operator. - network_alpha (`int`, *optional*): - Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. 
- - """ - - def __init__( - self, hidden_size, cross_attention_dim, rank=4, attention_op: Optional[str] = None, network_alpha=None - ): - super().__init__() - - self.hidden_size = hidden_size - self.cross_attention_dim = cross_attention_dim - self.rank = rank - self.attention_op = attention_op - - self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) - self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) - self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) - self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) - - def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - scale=1.0, - **cross_attention_kwargs - ): - batch_size, sequence_length, _ = ( - hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape - ) - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False) - - query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states) - query = attn.head_to_batch_dim(query, transpose=False) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states) - - key = attn.head_to_batch_dim(key, transpose=False) - value = attn.head_to_batch_dim(value, transpose=False) - - hidden_states = F.scaled_dot_product_attention_( - query, - key, - value, - attn_mask=attention_mask, - scale=attn.scale, - dropout_p=0.0, - training=attn.training, - attention_op=self.attention_op, - ) - - hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - return hidden_states - - -class CustomDiffusionXFormersAttnProcessor(nn.Layer): - def __init__( - self, - train_kv=True, - train_q_out=False, - hidden_size=None, - cross_attention_dim=None, - out_bias=True, - dropout=0.0, - attention_op: Optional[str] = None, - ): - super().__init__() - assert attention_op in [None, "cutlass", "flash"] - self.train_kv = train_kv - self.train_q_out = train_q_out - - self.hidden_size = hidden_size - self.cross_attention_dim = cross_attention_dim - self.attention_op = attention_op - - # `_custom_diffusion` id for easy serialization and loading. 
- if self.train_kv: - self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias_attr=False) - self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias_attr=False) - if self.train_q_out: - self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias_attr=False) - self.to_out_custom_diffusion = nn.LayerList([]) - self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias_attr=out_bias)) - self.to_out_custom_diffusion.append(nn.Dropout(dropout)) - - def __call__( - self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs - ): - batch_size, sequence_length, _ = ( - hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape - ) - - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False) - - if self.train_q_out: - query = self.to_q_custom_diffusion(hidden_states) - else: - query = attn.to_q(hidden_states) - - if encoder_hidden_states is None: - crossattn = False - encoder_hidden_states = hidden_states - else: - crossattn = True - if attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - if self.train_kv: - key = self.to_k_custom_diffusion(encoder_hidden_states) - value = self.to_v_custom_diffusion(encoder_hidden_states) - else: - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - if crossattn: - detach = paddle.ones_like(key) - detach[:, :1, :] = detach[:, :1, :] * 0.0 - key = detach * key + (1 - detach) * key.detach() - value = detach * value + (1 - detach) * value.detach() - - # if transpose = False, query's shape will be [batch_size, seq_len, num_head, head_dim] - query = attn.head_to_batch_dim(query, transpose=False) - key = attn.head_to_batch_dim(key, transpose=False) - value = attn.head_to_batch_dim(value, transpose=False) - - hidden_states = F.scaled_dot_product_attention_( - query, - key, - value, - attn_mask=attention_mask, - scale=attn.scale, - dropout_p=0.0, - training=attn.training, - attention_op=self.attention_op, - ) - # hidden_states = hidden_states.cast(query.dtype) - hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False) - - if self.train_q_out: - # linear proj - hidden_states = self.to_out_custom_diffusion[0](hidden_states) - # dropout - hidden_states = self.to_out_custom_diffusion[1](hidden_states) - else: - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - return hidden_states - - -class SlicedAttnProcessor: - def __init__(self, slice_size): - self.slice_size = slice_size - - def __call__( - self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs - ): - batch_size, sequence_length, _ = ( - hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape - ) - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, out_dim=3) - - query = attn.to_q(hidden_states) - query = attn.head_to_batch_dim(query) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - - query = query.flatten(0, 1) 
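 # flattening merges the batch and head axes into [batch * heads, seq_len, head_dim];
 # the key and value tensors below are flattened the same way, so attention can be
 # computed slice-by-slice along the merged axis with peak memory bounded by `slice_size`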
- key = key.flatten(0, 1) - value = value.flatten(0, 1) - - batch_size_attention = query.shape[0] - query_len = query.shape[1] - hidden_states = paddle.zeros((batch_size_attention, query_len, attn.head_dim), dtype=query.dtype) - for i in range(batch_size_attention // self.slice_size): - start_idx = i * self.slice_size - end_idx = (i + 1) * self.slice_size - - query_slice = query[start_idx:end_idx] - key_slice = key[start_idx:end_idx] - attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None - - attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice) - - attn_slice = paddle.matmul(attn_slice, value[start_idx:end_idx]) - - hidden_states[start_idx:end_idx] = attn_slice - - # reshape back to [bs, num_heads, seqlen, head_dim] - hidden_states = hidden_states.reshape([-1, attn.heads, query_len, attn.head_dim]) - - hidden_states = attn.batch_to_head_dim(hidden_states) - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - return hidden_states - - -class SlicedAttnAddedKVProcessor: - def __init__(self, slice_size): - self.slice_size = slice_size - - def __call__( - self, - attn: "Attention", - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs - ): - residual = hidden_states - hidden_states = hidden_states.reshape([hidden_states.shape[0], hidden_states.shape[1], -1]).transpose( - [0, 2, 1] - ) - - batch_size, sequence_length, _ = hidden_states.shape - - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, out_dim=3) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1]) - - query = attn.to_q(hidden_states) - query = attn.head_to_batch_dim(query) - - encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) - encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) - - encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj) - encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj) - - if not attn.only_cross_attention: - key = attn.to_k(hidden_states) - value = attn.to_v(hidden_states) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - key = paddle.concat([encoder_hidden_states_key_proj, key], axis=2) - value = paddle.concat([encoder_hidden_states_value_proj, value], axis=2) - else: - key = encoder_hidden_states_key_proj - value = encoder_hidden_states_value_proj - - # flatten - query = query.flatten(0, 1) - key = key.flatten(0, 1) - value = value.flatten(0, 1) - - batch_size_attention = query.shape[0] - query_len = query.shape[1] - hidden_states = paddle.zeros((batch_size_attention, query_len, attn.head_dim), dtype=query.dtype) - - for i in range(batch_size_attention // self.slice_size): - start_idx = i * self.slice_size - end_idx = (i + 1) * self.slice_size - - query_slice = query[start_idx:end_idx] - key_slice = key[start_idx:end_idx] - attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None - - attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice) - - attn_slice = paddle.matmul(attn_slice, value[start_idx:end_idx]) - - hidden_states[start_idx:end_idx] = attn_slice - - # reshape back to 
[bs, num_heads, seqlen, head_dim] - hidden_states = hidden_states.reshape([-1, attn.heads, query_len, attn.head_dim]) - - hidden_states = attn.batch_to_head_dim(hidden_states) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - hidden_states = hidden_states.transpose([0, 2, 1]).reshape(residual.shape) - hidden_states = hidden_states + residual - - return hidden_states - - -AttnProcessor2_5 = XFormersAttnProcessor -AttnAddedKVProcessor2_5 = XFormersAttnAddedKVProcessor -LoRAAttnProcessor2_5 = LoRAXFormersAttnProcessor -AttentionProcessor = Union[ - AttnProcessor, - AttnProcessor2_5, - XFormersAttnProcessor, - SlicedAttnProcessor, - AttnAddedKVProcessor, - SlicedAttnAddedKVProcessor, - AttnAddedKVProcessor2_5, - XFormersAttnAddedKVProcessor, - LoRAAttnProcessor, - LoRAXFormersAttnProcessor, - LoRAAttnProcessor2_5, - CustomDiffusionAttnProcessor, - CustomDiffusionXFormersAttnProcessor, -] diff --git a/ppdiffusers/ppdiffusers/models/autoencoder_kl.py b/ppdiffusers/ppdiffusers/models/autoencoder_kl.py deleted file mode 100644 index ee77c9103285..000000000000 --- a/ppdiffusers/ppdiffusers/models/autoencoder_kl.py +++ /dev/null @@ -1,341 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from dataclasses import dataclass -from typing import Optional, Tuple, Union - -import paddle -import paddle.nn as nn - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, apply_forward_hook -from .modeling_utils import ModelMixin -from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder - - -@dataclass -class AutoencoderKLOutput(BaseOutput): - """ - Output of AutoencoderKL encoding method. - - Args: - latent_dist (`DiagonalGaussianDistribution`): - Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`. - `DiagonalGaussianDistribution` allows for sampling latents from the distribution. - """ - - latent_dist: DiagonalGaussianDistribution - - -class AutoencoderKL(ModelMixin, ConfigMixin): - r"""Variational Autoencoder (VAE) model with KL loss from the paper Auto-Encoding Variational Bayes by Diederik P. Kingma - and Max Welling. - - This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the model (such as downloading or saving, etc.) - - Parameters: - in_channels (int, *optional*, defaults to 3): Number of channels in the input image. - out_channels (int, *optional*, defaults to 3): Number of channels in the output. - down_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("DownEncoderBlock2D",)`): Tuple of downsample block types. - up_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("UpDecoderBlock2D",)`): Tuple of upsample block types. 
- block_out_channels (`Tuple[int]`, *optional*, defaults to : - obj:`(64,)`): Tuple of block output channels. - act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. - latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space. - sample_size (`int`, *optional*, defaults to `32`): TODO - scaling_factor (`float`, *optional*, defaults to 0.18215): - The component-wise standard deviation of the trained latent space computed using the first batch of the - training set. This is used to scale the latent space to have unit variance when training the diffusion - model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the - diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1 - / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image - Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. - """ - _supports_gradient_checkpointing = True - - @register_to_config - def __init__( - self, - in_channels: int = 3, - out_channels: int = 3, - down_block_types: Tuple[str] = ("DownEncoderBlock2D",), - down_block_out_channels: Tuple[int] = None, - up_block_types: Tuple[str] = ("UpDecoderBlock2D",), - up_block_out_channels: Tuple[int] = None, - block_out_channels: Tuple[int] = (64,), - layers_per_block: int = 1, - act_fn: str = "silu", - latent_channels: int = 4, - norm_num_groups: int = 32, - sample_size: int = 32, - scaling_factor: float = 0.18215, - ): - super().__init__() - # if down_block_out_channels not givien, we will use block_out_channels - _down_block_out_channels = ( - self.config.block_out_channels if down_block_out_channels is None else self.config.down_block_out_channels - ) - # if up_block_out_channels not givien, we will use block_out_channels - _up_block_out_channels = ( - self.config.block_out_channels if up_block_out_channels is None else self.config.up_block_out_channels - ) - - # pass init params to Encoder - self.encoder = Encoder( - in_channels=in_channels, - out_channels=latent_channels, - down_block_types=down_block_types, - block_out_channels=_down_block_out_channels, - layers_per_block=layers_per_block, - act_fn=act_fn, - norm_num_groups=norm_num_groups, - double_z=True, - ) - - # pass init params to Decoder - self.decoder = Decoder( - in_channels=latent_channels, - out_channels=out_channels, - up_block_types=up_block_types, - block_out_channels=_up_block_out_channels, - layers_per_block=layers_per_block, - norm_num_groups=norm_num_groups, - act_fn=act_fn, - ) - - self.quant_conv = nn.Conv2D(2 * latent_channels, 2 * latent_channels, 1) - self.post_quant_conv = nn.Conv2D(latent_channels, latent_channels, 1) - - self.use_slicing = False - self.use_tiling = False - - # only relevant if vae tiling is enabled - self.tile_sample_min_size = self.config.sample_size - sample_size = ( - self.config.sample_size[0] - if isinstance(self.config.sample_size, (list, tuple)) - else self.config.sample_size - ) - self.tile_latent_min_size = int(sample_size / (2 ** (len(_up_block_out_channels) - 1))) - self.tile_overlap_factor = 0.25 - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (Encoder, Decoder)): - module.gradient_checkpointing = value - - def enable_tiling(self, use_tiling: bool = True): - r""" - Enable tiled VAE decoding. 
-        When this option is enabled, the VAE will split the input tensor into tiles to
-        compute decoding and encoding in several steps. This is useful to save a large amount of memory and to allow
-        the processing of larger images.
-        """
-        self.use_tiling = use_tiling
-
-    def disable_tiling(self):
-        r"""
-        Disable tiled VAE decoding. If `enable_tiling` was previously invoked, this method will go back to
-        computing decoding in one step.
-        """
-        self.enable_tiling(False)
-
-    def enable_slicing(self):
-        r"""
-        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
-        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
-        """
-        self.use_slicing = True
-
-    def disable_slicing(self):
-        r"""
-        Disable sliced VAE decoding. If `enable_slicing` was previously invoked, this method will go back to computing
-        decoding in one step.
-        """
-        self.use_slicing = False
-
-    @apply_forward_hook
-    def encode(self, x: paddle.Tensor, return_dict: bool = True) -> AutoencoderKLOutput:
-        # TODO junnyu, support float16
-        x = x.cast(self.dtype)
-        if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
-            return self.tiled_encode(x, return_dict=return_dict)
-
-        h = self.encoder(x)
-        moments = self.quant_conv(h)
-        posterior = DiagonalGaussianDistribution(moments)
-
-        if not return_dict:
-            return (posterior,)
-
-        return AutoencoderKLOutput(latent_dist=posterior)
-
-    def _decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]:
-        if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
-            return self.tiled_decode(z, return_dict=return_dict)
-
-        z = self.post_quant_conv(z)
-        dec = self.decoder(z)
-
-        if not return_dict:
-            return (dec,)
-
-        return DecoderOutput(sample=dec)
-
-    @apply_forward_hook
-    def decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]:
-        # TODO junnyu, add this to support pure fp16
-        z = z.cast(self.dtype)
-        if self.use_slicing and z.shape[0] > 1:
-            # note: paddle's split/chunk semantics may differ slightly from pytorch's
-            decoded_slices = [self._decode(z_slice).sample for z_slice in z.chunk(z.shape[0])]
-            decoded = paddle.concat(decoded_slices)
-        else:
-            decoded = self._decode(z).sample
-
-        if not return_dict:
-            return (decoded,)
-
-        return DecoderOutput(sample=decoded)
-
-    def blend_v(self, a, b, blend_extent):
-        for y in range(min(a.shape[2], b.shape[2], blend_extent)):
-            b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent)
-        return b
-
-    def blend_h(self, a, b, blend_extent):
-        for x in range(min(a.shape[3], b.shape[3], blend_extent)):
-            b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
-        return b
-
-    def tiled_encode(self, x: paddle.Tensor, return_dict: bool = True) -> AutoencoderKLOutput:
-        r"""Encode a batch of images using a tiled encoder.
-
-        When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
-        steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding
-        is different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts,
-        the tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in
-        the look of the output, but they should be much less noticeable.
-
-        Args:
-            x (`paddle.Tensor`): Input batch of images.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`AutoencoderKLOutput`] instead of a plain tuple.
-        """
-        overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
-        blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
-        row_limit = self.tile_latent_min_size - blend_extent
-
-        # Split the image into 512x512 tiles and encode them separately.
-        rows = []
-        for i in range(0, x.shape[2], overlap_size):
-            row = []
-            for j in range(0, x.shape[3], overlap_size):
-                tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
-                tile = self.encoder(tile)
-                tile = self.quant_conv(tile)
-                row.append(tile)
-            rows.append(row)
-        result_rows = []
-        for i, row in enumerate(rows):
-            result_row = []
-            for j, tile in enumerate(row):
-                # blend the above tile and the left tile
-                # to the current tile and add the current tile to the result row
-                if i > 0:
-                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
-                if j > 0:
-                    tile = self.blend_h(row[j - 1], tile, blend_extent)
-                result_row.append(tile[:, :, :row_limit, :row_limit])
-            result_rows.append(paddle.concat(result_row, axis=3))
-
-        moments = paddle.concat(result_rows, axis=2)
-        posterior = DiagonalGaussianDistribution(moments)
-
-        if not return_dict:
-            return (posterior,)
-
-        return AutoencoderKLOutput(latent_dist=posterior)
-
-    def tiled_decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]:
-        r"""Decode a batch of images using a tiled decoder.
-
-        When this option is enabled, the VAE will split the input tensor into tiles to compute decoding in several
-        steps. This is useful to keep memory use constant regardless of image size. The end result of tiled decoding
-        is different from non-tiled decoding because each tile uses a different decoder. To avoid tiling artifacts,
-        the tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in
-        the look of the output, but they should be much less noticeable.
-
-        Args:
-            z (`paddle.Tensor`): Input batch of latent vectors.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
-        """
-        overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
-        blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
-        row_limit = self.tile_sample_min_size - blend_extent
-
-        # Split z into overlapping 64x64 tiles and decode them separately.
-        # The tiles have an overlap to avoid seams between tiles.
- rows = [] - for i in range(0, z.shape[2], overlap_size): - row = [] - for j in range(0, z.shape[3], overlap_size): - tile = z[:, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size] - tile = self.post_quant_conv(tile) - decoded = self.decoder(tile) - row.append(decoded) - rows.append(row) - result_rows = [] - for i, row in enumerate(rows): - result_row = [] - for j, tile in enumerate(row): - # blend the above tile and the left tile - # to the current tile and add the current tile to the result row - if i > 0: - tile = self.blend_v(rows[i - 1][j], tile, blend_extent) - if j > 0: - tile = self.blend_h(row[j - 1], tile, blend_extent) - result_row.append(tile[:, :, :row_limit, :row_limit]) - result_rows.append(paddle.concat(result_row, axis=3)) - - dec = paddle.concat(result_rows, axis=2) - if not return_dict: - return (dec,) - - return DecoderOutput(sample=dec) - - def forward( - self, - sample: paddle.Tensor, - sample_posterior: bool = False, - return_dict: bool = True, - generator: Optional[paddle.Generator] = None, - ) -> Union[DecoderOutput, paddle.Tensor]: - r""" - Args: - sample (`paddle.Tensor`): Input sample. - sample_posterior (`bool`, *optional*, defaults to `False`): - Whether to sample from the posterior. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`DecoderOutput`] instead of a plain tuple. - """ - x = sample.cast(self.dtype) - posterior = self.encode(x).latent_dist - if sample_posterior: - z = posterior.sample(generator=generator) - else: - z = posterior.mode() - dec = self.decode(z).sample - - if not return_dict: - return (dec,) - - return DecoderOutput(sample=dec) diff --git a/ppdiffusers/ppdiffusers/models/controlnet.py b/ppdiffusers/ppdiffusers/models/controlnet.py deleted file mode 100644 index 353d63161e96..000000000000 --- a/ppdiffusers/ppdiffusers/models/controlnet.py +++ /dev/null @@ -1,618 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
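For reference, the scaling, slicing, and tiling controls documented in the `AutoencoderKL` docstrings above are used roughly as follows. This is a minimal sketch, assuming the pre-removal `ppdiffusers` package (whose API mirrors `diffusers`); the checkpoint id and tensor shapes are illustrative, not part of this patch.

import paddle
from ppdiffusers import AutoencoderKL  # assumes the package as it was before this removal

# Load a Stable Diffusion VAE (example checkpoint id).
vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")
vae.enable_tiling()   # split large inputs into overlapping tiles to save memory
vae.enable_slicing()  # decode batched latents one sample at a time

image = paddle.randn([1, 3, 512, 512])                # dummy image batch in [-1, 1]
latents = vae.encode(image).latent_dist.sample()      # roughly (1, 4, 64, 64)
latents = latents * vae.config.scaling_factor         # z = z * scaling_factor before the diffusion model
decoded = vae.decode(latents / vae.config.scaling_factor).sample  # undo the scaling when decoding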
-from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Union - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ..configuration_utils import ConfigMixin, register_to_config -from ..initializer import zeros_ -from ..utils import BaseOutput, logging -from .attention_processor import AttentionProcessor, AttnProcessor -from .embeddings import TimestepEmbedding, Timesteps -from .modeling_utils import ModelMixin -from .unet_2d_blocks import ( - CrossAttnDownBlock2D, - DownBlock2D, - UNetMidBlock2DCrossAttn, - get_down_block, -) -from .unet_2d_condition import UNet2DConditionModel - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -@dataclass -class ControlNetOutput(BaseOutput): - down_block_res_samples: Tuple[paddle.Tensor] - mid_block_res_sample: paddle.Tensor - - -class Mish(nn.Layer): - def forward(self, hidden_states): - return hidden_states * paddle.tanh(F.softplus(hidden_states)) - - -class ControlNetConditioningEmbedding(nn.Layer): - """ - "Stable Diffusion uses a pre-processing method similar to VQ-GAN [11] to convert the entire dataset of 512 × 512 - images into smaller 64 × 64 “latent images” for stabilized training. This requires ControlNets to convert - image-based conditions to 64 × 64 feature space to match the convolution size. We use a tiny network E(·) of four - convolution layers with 4 × 4 kernels and 2 × 2 strides (activated by ReLU, channels are 16, 32, 64, 128, - initialized with Gaussian weights, trained jointly with the full model) to encode image-space conditions ... into - feature maps ..." - """ - - def __init__( - self, - conditioning_embedding_channels: int, - conditioning_channels: int = 3, - block_out_channels: Tuple[int] = (16, 32, 96, 256), - ): - super().__init__() - - self.conv_in = nn.Conv2D(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1) - - self.blocks = nn.LayerList([]) - - for i in range(len(block_out_channels) - 1): - channel_in = block_out_channels[i] - channel_out = block_out_channels[i + 1] - self.blocks.append(nn.Conv2D(channel_in, channel_in, kernel_size=3, padding=1)) - self.blocks.append(nn.Conv2D(channel_in, channel_out, kernel_size=3, padding=1, stride=2)) - - self.conv_out = zero_module( - nn.Conv2D(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1) - ) - - def forward(self, conditioning): - embedding = self.conv_in(conditioning) - embedding = F.silu(embedding) - - for block in self.blocks: - embedding = block(embedding) - embedding = F.silu(embedding) - - embedding = self.conv_out(embedding) - - return embedding - - -class ControlNetModel(ModelMixin, ConfigMixin): - _supports_gradient_checkpointing = True - - @register_to_config - def __init__( - self, - in_channels: int = 4, - flip_sin_to_cos: bool = True, - freq_shift: int = 0, - down_block_types: Tuple[str] = ( - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "DownBlock2D", - ), - only_cross_attention: Union[bool, Tuple[bool]] = False, - block_out_channels: Tuple[int] = (320, 640, 1280, 1280), - layers_per_block: int = 2, - downsample_padding: int = 1, - mid_block_scale_factor: float = 1, - act_fn: str = "silu", - norm_num_groups: Optional[int] = 32, - norm_eps: float = 1e-5, - cross_attention_dim: int = 1280, - attention_head_dim: Union[int, Tuple[int]] = 8, - use_linear_projection: bool = False, - class_embed_type: Optional[str] = None, - num_class_embeds: Optional[int] = None, - upcast_attention: bool = False, - 
resnet_time_scale_shift: str = "default", - projection_class_embeddings_input_dim: Optional[int] = None, - controlnet_conditioning_channel_order: str = "rgb", - conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256), - global_pool_conditions: bool = False, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - - # Check inputs - if len(block_out_channels) != len(down_block_types): - raise ValueError( - f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." - ) - - if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): - raise ValueError( - f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." - ) - - if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): - raise ValueError( - f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." - ) - - # input - conv_in_kernel = 3 - conv_in_padding = (conv_in_kernel - 1) // 2 - self.conv_in = nn.Conv2D( - in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding - ) - - # time - time_embed_dim = block_out_channels[0] * 4 - - self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) - timestep_input_dim = block_out_channels[0] - - self.time_embedding = TimestepEmbedding( - timestep_input_dim, - time_embed_dim, - act_fn=act_fn, - ) - - # class embedding - if class_embed_type is None and num_class_embeds is not None: - self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) - elif class_embed_type == "timestep": - self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) - elif class_embed_type == "identity": - self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) - elif class_embed_type == "projection": - if projection_class_embeddings_input_dim is None: - raise ValueError( - "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set" - ) - # The projection `class_embed_type` is the same as the timestep `class_embed_type` except - # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings - # 2. it projects from an arbitrary input dimension. - # - # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. - # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. - # As a result, `TimestepEmbedding` can be passed arbitrary vectors. 
- self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) - else: - self.class_embedding = None - - # control net conditioning embedding - self.controlnet_cond_embedding = ControlNetConditioningEmbedding( - conditioning_embedding_channels=block_out_channels[0], - block_out_channels=conditioning_embedding_out_channels, - ) - - self.down_blocks = nn.LayerList([]) - self.controlnet_down_blocks = nn.LayerList([]) - - if isinstance(only_cross_attention, bool): - only_cross_attention = [only_cross_attention] * len(down_block_types) - - if isinstance(attention_head_dim, int): - attention_head_dim = (attention_head_dim,) * len(down_block_types) - - # pre_temb_act_fun opt - self.resnet_pre_temb_non_linearity = resnet_pre_temb_non_linearity - if resnet_pre_temb_non_linearity: - if act_fn == "swish": - self.down_resnet_temb_nonlinearity = lambda x: F.silu(x) - elif act_fn == "mish": - self.down_resnet_temb_nonlinearity = nn.Mish() - elif act_fn == "silu": - self.down_resnet_temb_nonlinearity = nn.Silu() - elif act_fn == "gelu": - self.down_resnet_temb_nonlinearity = nn.GELU() - - # down - output_channel = block_out_channels[0] - - controlnet_block = nn.Conv2D(output_channel, output_channel, kernel_size=1) - controlnet_block = zero_module(controlnet_block) - self.controlnet_down_blocks.append(controlnet_block) - - for i, down_block_type in enumerate(down_block_types): - input_channel = output_channel - output_channel = block_out_channels[i] - is_final_block = i == len(block_out_channels) - 1 - - down_block = get_down_block( - down_block_type, - num_layers=layers_per_block, - in_channels=input_channel, - out_channels=output_channel, - temb_channels=time_embed_dim, - add_downsample=not is_final_block, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - resnet_groups=norm_num_groups, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attention_head_dim[i], - downsample_padding=downsample_padding, - use_linear_projection=use_linear_projection, - only_cross_attention=only_cross_attention[i], - upcast_attention=upcast_attention, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=self.resnet_pre_temb_non_linearity, - ) - self.down_blocks.append(down_block) - - for _ in range(layers_per_block): - controlnet_block = nn.Conv2D(output_channel, output_channel, kernel_size=1) - controlnet_block = zero_module(controlnet_block) - self.controlnet_down_blocks.append(controlnet_block) - - if not is_final_block: - controlnet_block = nn.Conv2D(output_channel, output_channel, kernel_size=1) - controlnet_block = zero_module(controlnet_block) - self.controlnet_down_blocks.append(controlnet_block) - - # mid - mid_block_channel = block_out_channels[-1] - - controlnet_block = nn.Conv2D(mid_block_channel, mid_block_channel, kernel_size=1) - controlnet_block = zero_module(controlnet_block) - self.controlnet_mid_block = controlnet_block - - self.mid_block = UNetMidBlock2DCrossAttn( - in_channels=mid_block_channel, - temb_channels=time_embed_dim, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - output_scale_factor=mid_block_scale_factor, - resnet_time_scale_shift=resnet_time_scale_shift, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attention_head_dim[-1], - resnet_groups=norm_num_groups, - use_linear_projection=use_linear_projection, - upcast_attention=upcast_attention, - resnet_pre_temb_non_linearity=self.resnet_pre_temb_non_linearity, - ) - - @classmethod - def from_unet( - cls, - unet: UNet2DConditionModel, - 
controlnet_conditioning_channel_order: str = "rgb", - conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256), - load_weights_from_unet: bool = True, - ): - r""" - Instantiate Controlnet class from UNet2DConditionModel. - Parameters: - unet (`UNet2DConditionModel`): - UNet model which weights are copied to the ControlNet. Note that all configuration options are also - copied where applicable. - """ - controlnet = cls( - in_channels=unet.config.in_channels, - flip_sin_to_cos=unet.config.flip_sin_to_cos, - freq_shift=unet.config.freq_shift, - down_block_types=unet.config.down_block_types, - only_cross_attention=unet.config.only_cross_attention, - block_out_channels=unet.config.block_out_channels, - layers_per_block=unet.config.layers_per_block, - downsample_padding=unet.config.downsample_padding, - mid_block_scale_factor=unet.config.mid_block_scale_factor, - act_fn=unet.config.act_fn, - norm_num_groups=unet.config.norm_num_groups, - norm_eps=unet.config.norm_eps, - cross_attention_dim=unet.config.cross_attention_dim, - attention_head_dim=unet.config.attention_head_dim, - use_linear_projection=unet.config.use_linear_projection, - class_embed_type=unet.config.class_embed_type, - num_class_embeds=unet.config.num_class_embeds, - upcast_attention=unet.config.upcast_attention, - resnet_time_scale_shift=unet.config.resnet_time_scale_shift, - projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim, - controlnet_conditioning_channel_order=controlnet_conditioning_channel_order, - conditioning_embedding_out_channels=conditioning_embedding_out_channels, - resnet_pre_temb_non_linearity=unet.config.resnet_pre_temb_non_linearity, - ) - - if load_weights_from_unet: - controlnet.conv_in.load_dict(unet.conv_in.state_dict()) - controlnet.time_proj.load_dict(unet.time_proj.state_dict()) - controlnet.time_embedding.load_dict(unet.time_embedding.state_dict()) - - if controlnet.class_embedding: - controlnet.class_embedding.load_dict(unet.class_embedding.state_dict()) - - controlnet.down_blocks.load_dict(unet.down_blocks.state_dict()) - controlnet.mid_block.load_dict(unet.mid_block.state_dict()) - - return controlnet - - @property - def attn_processors(self) -> Dict[str, AttentionProcessor]: - r""" - Returns: - `dict` of attention processors: A dictionary containing all attention processors used in the model with - indexed by its weight name. - """ - # set recursively - processors = {} - - def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]): - if hasattr(module, "set_processor"): - processors[f"{name}.processor"] = module.processor - - for sub_name, child in module.named_children(): - fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) - - return processors - - for name, module in self.named_children(): - fn_recursive_add_processors(name, module, processors) - - return processors - - def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): - r""" - Parameters: - `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): - The instantiated processor class or a dictionary of processor classes that will be set as the processor - of **all** `Attention` layers. - In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. 
This is strongly recommended when setting trainable attention processors.: - """ - count = len(self.attn_processors.keys()) - - if isinstance(processor, dict) and len(processor) != count: - raise ValueError( - f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" - f" number of attention layers: {count}. Please make sure to pass {count} processor classes." - ) - - def fn_recursive_attn_processor(name: str, module: nn.Layer, processor): - if hasattr(module, "set_processor"): - if not isinstance(processor, dict): - module.set_processor(processor) - else: - module.set_processor(processor.pop(f"{name}.processor")) - - for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) - - for name, module in self.named_children(): - fn_recursive_attn_processor(name, module, processor) - - # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor - def set_default_attn_processor(self): - """ - Disables custom attention processors and sets the default attention implementation. - """ - self.set_attn_processor(AttnProcessor()) - - def set_attention_slice(self, slice_size): - r""" - Enable sliced attention computation. - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - Args: - slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is - provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` - must be a multiple of `slice_size`. - """ - sliceable_head_dims = [] - - def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): - if hasattr(module, "set_attention_slice"): - sliceable_head_dims.append(module.sliceable_head_dim) - - for child in module.children(): - fn_recursive_retrieve_sliceable_dims(child) - - # retrieve number of attention layers - for module in self.children(): - fn_recursive_retrieve_sliceable_dims(module) - - num_sliceable_layers = len(sliceable_head_dims) - - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = [dim // 2 for dim in sliceable_head_dims] - elif slice_size == "max": - # make smallest slice possible - slice_size = num_sliceable_layers * [1] - - slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size - - if len(slice_size) != len(sliceable_head_dims): - raise ValueError( - f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" - f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." - ) - - for i in range(len(slice_size)): - size = slice_size[i] - dim = sliceable_head_dims[i] - if size is not None and size > dim: - raise ValueError(f"size {size} has to be smaller or equal to {dim}.") - - # Recursively walk through all the children. 
- # Any children which exposes the set_attention_slice method - # gets the message - def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]): - if hasattr(module, "set_attention_slice"): - module.set_attention_slice(slice_size.pop()) - - for child in module.children(): - fn_recursive_set_attention_slice(child, slice_size) - - reversed_slice_size = list(reversed(slice_size)) - for module in self.children(): - fn_recursive_set_attention_slice(module, reversed_slice_size) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): - module.gradient_checkpointing = value - - def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - encoder_hidden_states: paddle.Tensor, - controlnet_cond: paddle.Tensor, - conditioning_scale: Union[List[float], float] = 1.0, - class_labels: Optional[paddle.Tensor] = None, - timestep_cond: Optional[paddle.Tensor] = None, - attention_mask: Optional[paddle.Tensor] = None, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - guess_mode: bool = False, - return_dict: bool = True, - ) -> Union[ControlNetOutput, Tuple]: - # TODO junnyu, add this to support pure fp16 - sample = sample.cast(self.dtype) - - # check channel order - channel_order = self.config.controlnet_conditioning_channel_order - - if channel_order == "rgb": - # in rgb order by default - ... - elif channel_order == "bgr": - controlnet_cond = paddle.flip(controlnet_cond, axis=[1]) - else: - raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}") - - # prepare attention_mask - if attention_mask is not None: - attention_mask = (1 - attention_mask.cast(sample.dtype)) * -10000.0 - attention_mask = attention_mask.unsqueeze(1) - - # 1. time - timesteps = timestep - if not paddle.is_tensor(timesteps): - timesteps = paddle.to_tensor([timesteps], dtype="int64") - elif len(timesteps.shape) == 0: - timesteps = timesteps[None] - - # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps.expand( - [ - sample.shape[0], - ] - ) - - t_emb = self.time_proj(timesteps) - - # timesteps does not contain any weights and will always return f32 tensors - # but time_embedding might actually be running in fp16. so we need to cast here. - # there might be better ways to encapsulate this. - t_emb = t_emb.cast(dtype=self.dtype) - - emb = self.time_embedding(t_emb, timestep_cond) - - if self.class_embedding is not None: - if class_labels is None: - raise ValueError("class_labels should be provided when num_class_embeds > 0") - - # maybe cast it to float16 - class_labels = class_labels.cast(self.dtype) - if self.config.class_embed_type == "timestep": - class_labels = self.time_proj(class_labels) - - # maybe cast it to int64 - if isinstance(self.class_embedding, nn.Embedding): - class_labels = class_labels.cast(paddle.int64) - class_emb = self.class_embedding(class_labels).cast(self.dtype) - emb = emb + class_emb - - if self.resnet_pre_temb_non_linearity: - emb = self.down_resnet_temb_nonlinearity(emb) - - # 2. pre-process - sample = self.conv_in(sample) - - controlnet_cond = self.controlnet_cond_embedding(controlnet_cond) - - sample += controlnet_cond - - # 3. 
down - down_block_res_samples = (sample,) - - for downsample_block in self.down_blocks: - if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: - sample, res_samples = downsample_block( - hidden_states=sample, - temb=emb, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, - ) - else: - sample, res_samples = downsample_block(hidden_states=sample, temb=emb) - - down_block_res_samples += res_samples - - # 4. mid - if self.mid_block is not None: - sample = self.mid_block( - sample, - emb, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, - ) - - # 5. Control net blocks - - controlnet_down_block_res_samples = () - - for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks): - down_block_res_sample = controlnet_block(down_block_res_sample) - controlnet_down_block_res_samples += (down_block_res_sample,) - - down_block_res_samples = controlnet_down_block_res_samples - - mid_block_res_sample = self.controlnet_mid_block(sample) - - # 6. scaling - if guess_mode: - scales = paddle.logspace(-1, 0, len(down_block_res_samples) + 1) # 0.1 to 1.0 - scales *= conditioning_scale - down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)] - mid_block_res_sample *= scales[-1] # last one - else: - # add conditioning_scale https://github.com/huggingface/diffusers/pull/2627 - if isinstance(conditioning_scale, (float, int)): - down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples] - mid_block_res_sample *= conditioning_scale - else: - down_block_res_samples = [ - sample * ccs for sample, ccs in zip(down_block_res_samples, conditioning_scale[:-1]) - ] - mid_block_res_sample *= conditioning_scale[-1] - - if self.config.global_pool_conditions: - down_block_res_samples = [ - paddle.mean(sample, axis=(2, 3), keepdim=True) for sample in down_block_res_samples - ] - mid_block_res_sample = paddle.mean(mid_block_res_sample, axis=(2, 3), keepdim=True) - - if not return_dict: - return (down_block_res_samples, mid_block_res_sample) - - return ControlNetOutput( - down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample - ) - - -def zero_module(module): - for p in module.parameters(): - zeros_(p) - return module diff --git a/ppdiffusers/ppdiffusers/models/cross_attention.py b/ppdiffusers/ppdiffusers/models/cross_attention.py deleted file mode 100644 index 5e00026ccb49..000000000000 --- a/ppdiffusers/ppdiffusers/models/cross_attention.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
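The `ControlNetModel` above copies a UNet's input, time, down, and mid blocks and emits one zero-initialized residual per block plus a mid-block residual. A rough usage sketch, assuming the pre-removal `ppdiffusers` API; the checkpoint id, shapes, and conditioning image are illustrative assumptions only.

import paddle
from ppdiffusers import ControlNetModel, UNet2DConditionModel  # pre-removal ppdiffusers

unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
controlnet = ControlNetModel.from_unet(unet)  # copy config (and weights) from the UNet

sample = paddle.randn([1, 4, 64, 64])        # noisy latents
timestep = paddle.to_tensor([999])
text_embeds = paddle.randn([1, 77, 768])     # encoder_hidden_states from the text encoder
cond_image = paddle.rand([1, 3, 512, 512])   # e.g. a canny edge map in [0, 1]

out = controlnet(
    sample,
    timestep,
    encoder_hidden_states=text_embeds,
    controlnet_cond=cond_image,
    conditioning_scale=1.0,  # scales every residual; guess_mode=True ramps them via logspace instead
)
down_residuals, mid_residual = out.down_block_res_samples, out.mid_block_res_sample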
-from ..utils import deprecate -from .attention_processor import ( # noqa: F401 - Attention, - AttentionProcessor, - AttnAddedKVProcessor, -) -from .attention_processor import AttnProcessor as AttnProcessorRename # noqa: F401 -from .attention_processor import ( # noqa: F401 - AttnProcessor2_5, - LoRAAttnProcessor, - LoRALinearLayer, - LoRAXFormersAttnProcessor, - SlicedAttnAddedKVProcessor, - SlicedAttnProcessor, - XFormersAttnProcessor, -) - -deprecate( - "cross_attention", - "0.18.0", - "Importing from cross_attention is deprecated. Please import from diffusers.models.attention_processor instead.", - standard_warn=False, -) - - -AttnProcessor = AttentionProcessor - - -class CrossAttention(Attention): - def __init__(self, *args, **kwargs): - deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) - super().__init__(*args, **kwargs) - - -class CrossAttnProcessor(AttnProcessorRename): - def __init__(self, *args, **kwargs): - deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) - super().__init__(*args, **kwargs) - - -class LoRACrossAttnProcessor(LoRAAttnProcessor): - def __init__(self, *args, **kwargs): - deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) - super().__init__(*args, **kwargs) - - -class CrossAttnAddedKVProcessor(AttnAddedKVProcessor): - def __init__(self, *args, **kwargs): - deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) - super().__init__(*args, **kwargs) - - -class XFormersCrossAttnProcessor(XFormersAttnProcessor): - def __init__(self, *args, **kwargs): - deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) - super().__init__(*args, **kwargs) - - -class LoRAXFormersCrossAttnProcessor(LoRAXFormersAttnProcessor): - def __init__(self, *args, **kwargs): - deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) - super().__init__(*args, **kwargs) - - -class SlicedCrossAttnProcessor(SlicedAttnProcessor): - def __init__(self, *args, **kwargs): - deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. 
Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) - super().__init__(*args, **kwargs) - - -class SlicedCrossAttnAddedKVProcessor(SlicedAttnAddedKVProcessor): - def __init__(self, *args, **kwargs): - deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) - super().__init__(*args, **kwargs) diff --git a/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py b/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py deleted file mode 100644 index d6f680e81fc6..000000000000 --- a/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import Optional - -import paddle.nn as nn - -from .transformer_2d import Transformer2DModel, Transformer2DModelOutput - - -class DualTransformer2DModel(nn.Layer): - """ - Dual transformer wrapper that combines two `Transformer2DModel`s for mixed inference. - - Parameters: - num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention. - attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head. - in_channels (`int`, *optional*): - Pass if the input is continuous. The number of channels in the input and output. - num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use. - dropout (`float`, *optional*, defaults to 0.1): The dropout probability to use. - cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use. - sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images. - Note that this is fixed at training time as it is used for learning a number of position embeddings. See - `ImagePositionalEmbeddings`. - num_vector_embeds (`int`, *optional*): - Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels. - Includes the class for the masked latent pixel. - activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. - num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`. - The number of diffusion steps used during training. Note that this is fixed at training time as it is used - to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for - up to but not more than steps than `num_embeds_ada_norm`. 
- attention_bias (`bool`, *optional*): - Configure if the TransformerBlocks' attention should contain a bias parameter. - """ - - def __init__( - self, - num_attention_heads: int = 16, - attention_head_dim: int = 88, - in_channels: Optional[int] = None, - num_layers: int = 1, - dropout: float = 0.0, - norm_num_groups: int = 32, - cross_attention_dim: Optional[int] = None, - attention_bias: bool = False, - sample_size: Optional[int] = None, - num_vector_embeds: Optional[int] = None, - activation_fn: str = "geglu", - num_embeds_ada_norm: Optional[int] = None, - ): - super().__init__() - self.transformers = nn.LayerList( - [ - Transformer2DModel( - num_attention_heads=num_attention_heads, - attention_head_dim=attention_head_dim, - in_channels=in_channels, - num_layers=num_layers, - dropout=dropout, - norm_num_groups=norm_num_groups, - cross_attention_dim=cross_attention_dim, - attention_bias=attention_bias, - sample_size=sample_size, - num_vector_embeds=num_vector_embeds, - activation_fn=activation_fn, - num_embeds_ada_norm=num_embeds_ada_norm, - ) - for _ in range(2) - ] - ) - - # Variables that can be set by a pipeline: - - # The ratio of transformer1 to transformer2's output states to be combined during inference - self.mix_ratio = 0.5 - - # The shape of `encoder_hidden_states` is expected to be - # `(batch_size, condition_lengths[0]+condition_lengths[1], num_features)` - self.condition_lengths = [77, 257] - - # Which transformer to use to encode which condition. - # E.g. `(1, 0)` means that we'll use `transformers[1](conditions[0])` and `transformers[0](conditions[1])` - self.transformer_index_for_condition = [1, 0] - - def forward( - self, - hidden_states, - encoder_hidden_states, - timestep=None, - attention_mask=None, - cross_attention_kwargs=None, - return_dict: bool = True, - ): - """ - Args: - hidden_states ( When discrete, `paddle.Tensor` of shape `(batch size, num latent pixels)`. - When continuous, `paddle.Tensor` of shape `(batch size, channel, height, width)`): Input - hidden_states - encoder_hidden_states ( `paddle.Tensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): - Conditional embeddings for cross attention layer. If not given, cross-attention defaults to - self-attention. - timestep ( `paddle.Tensor`, *optional*): - Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step. - attention_mask (`paddle.Tensor`, *optional*): - Optional attention mask to be applied in Attention - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. - - Returns: - [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`: - [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. 
- """ - input_states = hidden_states - - encoded_states = [] - tokens_start = 0 - # attention_mask is not used yet - for i in range(2): - # for each of the two transformers, pass the corresponding condition tokens - condition_state = encoder_hidden_states[:, tokens_start : tokens_start + self.condition_lengths[i]] - transformer_index = self.transformer_index_for_condition[i] - encoded_state = self.transformers[transformer_index]( - input_states, - encoder_hidden_states=condition_state, - timestep=timestep, - cross_attention_kwargs=cross_attention_kwargs, - return_dict=False, - )[0] - encoded_states.append(encoded_state - input_states) - tokens_start += self.condition_lengths[i] - - output_states = encoded_states[0] * self.mix_ratio + encoded_states[1] * (1 - self.mix_ratio) - output_states = output_states + input_states - - if not return_dict: - return (output_states,) - - return Transformer2DModelOutput(sample=output_states) diff --git a/ppdiffusers/ppdiffusers/models/ema.py b/ppdiffusers/ppdiffusers/models/ema.py deleted file mode 100644 index b42e0c2ad02a..000000000000 --- a/ppdiffusers/ppdiffusers/models/ema.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -from paddle import nn - - -class LitEma(nn.Layer): - """ - Exponential Moving Average (EMA) of model updates - - Parameters: - model: The model architecture for apply EMA. - decay: The exponential decay. Default 0.9999. - use_num_updates: Whether to use number of updates when computing - averages. 
- """ - - def __init__(self, model, decay=0.9999, use_num_upates=True): - super().__init__() - if decay < 0.0 or decay > 1.0: - raise ValueError("Decay must be between 0 and 1") - - self.m_name2s_name = {} - self.register_buffer("decay", paddle.to_tensor(decay, dtype=paddle.float32)) - self.register_buffer( - "num_updates", - paddle.to_tensor(0, dtype=paddle.int64) if use_num_upates else paddle.to_tensor(-1, dtype=paddle.int64), - ) - - for name, p in model.named_parameters(): - if not p.stop_gradient: - # remove as '.'-character is not allowed in buffers - s_name = name.replace(".", "") - self.m_name2s_name.update({name: s_name}) - self.register_buffer(s_name, p.clone().detach()) - - self.collected_params = [] - - def forward(self, model): - decay = self.decay - - if self.num_updates >= 0: - self.num_updates += 1 - decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates)) - - one_minus_decay = 1.0 - decay - - with paddle.no_grad(): - m_param = dict(model.named_parameters()) - shadow_params = dict(self.named_buffers()) - - for key in m_param: - if not m_param[key].stop_gradient: - sname = self.m_name2s_name[key] - shadow_params[sname].scale_(decay) - shadow_params[sname].add_(m_param[key] * one_minus_decay) - else: - assert key not in self.m_name2s_name - - def copy_to(self, model): - m_param = dict(model.named_parameters()) - shadow_params = dict(self.named_buffers()) - for key in m_param: - if not m_param[key].stop_gradient: - m_param[key].copy_(shadow_params[self.m_name2s_name[key]], False) - else: - assert key not in self.m_name2s_name - - def store(self, parameters): - """ - Save the current parameters for restoring later. - Args: - parameters: Iterable of `EagerParamBase`; the parameters to be - temporarily stored. - """ - self.collected_params = [param.detach().cpu().clone() for param in parameters] - - def restore(self, parameters): - """ - Restore the parameters stored with the `store` method. - Useful to validate the model with EMA parameters without affecting the - original optimization process. Store the parameters before the - `copy_to` method. After validation (or model saving), use this to - restore the former parameters. - Args: - parameters: Iterable of `EagerParamBase`; the parameters to be - updated with the stored parameters. - """ - for c_param, param in zip(self.collected_params, parameters): - param.copy_(c_param, False) - self.collected_params = None diff --git a/ppdiffusers/ppdiffusers/models/embeddings.py b/ppdiffusers/ppdiffusers/models/embeddings.py deleted file mode 100644 index 2d0320eb88d9..000000000000 --- a/ppdiffusers/ppdiffusers/models/embeddings.py +++ /dev/null @@ -1,464 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import math
-from typing import Optional
-
-import numpy as np
-import paddle
-from paddle import nn
-
-
-def get_timestep_embedding(
-    timesteps: paddle.Tensor,
-    embedding_dim: int,
-    flip_sin_to_cos: bool = False,
-    downscale_freq_shift: float = 1,
-    scale: float = 1,
-    max_period: int = 10000,
-):
-    """
-    This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
-
-    :param timesteps: a 1-D Tensor of N indices, one per batch element. These may be fractional.
-    :param embedding_dim: the dimension of the output.
-    :param max_period: controls the minimum frequency of the embeddings.
-    :return: an [N x dim] Tensor of positional embeddings.
-    """
-    assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
-
-    half_dim = embedding_dim // 2
-    exponent = -math.log(max_period) * paddle.arange(start=0, end=half_dim, dtype="float32")
-
-    exponent = exponent / (half_dim - downscale_freq_shift)
-
-    emb = paddle.exp(exponent)
-    emb = timesteps[:, None].cast("float32") * emb[None, :]
-
-    # scale embeddings
-    emb = scale * emb
-
-    # concat sine and cosine embeddings
-    emb = paddle.concat([paddle.sin(emb), paddle.cos(emb)], axis=-1)
-
-    # flip sine and cosine embeddings
-    if flip_sin_to_cos:
-        emb = paddle.concat([emb[:, half_dim:], emb[:, :half_dim]], axis=-1)
-
-    # zero pad (paddle.concat expects a list of tensors)
-    if embedding_dim % 2 == 1:
-        emb = paddle.concat([emb, paddle.zeros([emb.shape[0], 1])], axis=-1)
-    return emb
-
-
-def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
-    """
-    grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or
-    [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
-    """
-    grid_h = np.arange(grid_size, dtype=np.float32)
-    grid_w = np.arange(grid_size, dtype=np.float32)
-    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
-    grid = np.stack(grid, axis=0)
-
-    grid = grid.reshape([2, 1, grid_size, grid_size])
-    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
-    if cls_token and extra_tokens > 0:
-        pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
-    return pos_embed
-
-
-def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
-    if embed_dim % 2 != 0:
-        raise ValueError("embed_dim must be divisible by 2")
-
-    # use half of dimensions to encode grid_h
-    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
-    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
-
-    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
-    return emb
-
-
-def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
-    """
-    embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D)
-    """
-    if embed_dim % 2 != 0:
-        raise ValueError("embed_dim must be divisible by 2")
-
-    omega = np.arange(embed_dim // 2, dtype=np.float64)
-    omega /= embed_dim / 2.0
-    omega = 1.0 / 10000**omega  # (D/2,)
-
-    pos = pos.reshape(-1)  # (M,)
-    out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
-
-    emb_sin = np.sin(out)  # (M, D/2)
-    emb_cos = np.cos(out)  # (M, D/2)
-
-    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
-    return emb
-
-
-class PatchEmbed(nn.Layer):
-    """2D Image to Patch Embedding"""
-
-    def __init__(
-        self,
-        height=224,
-        width=224,
-        patch_size=16,
-        in_channels=3,
-        embed_dim=768,
-        layer_norm=False,
-        flatten=True,
-        bias=True,
-        add_pos_embed=True,
-    ):
-        super().__init__()
-
-        num_patches = (height // patch_size) * (width //
patch_size) - self.flatten = flatten - self.layer_norm = layer_norm - - self.proj = nn.Conv2D( - in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias_attr=bias - ) - if layer_norm: - # elementwise_affine=False -> weight_attr=False, bias_attr=False - self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6, weight_attr=False, bias_attr=False) - else: - self.norm = None - - self.add_pos_embed = add_pos_embed - if add_pos_embed: - pos_embed = get_2d_sincos_pos_embed(embed_dim, int(num_patches**0.5)) - self.register_buffer( - "pos_embed", paddle.to_tensor(pos_embed).cast("float32").unsqueeze(0), persistable=False - ) - - def forward(self, latent): - latent = self.proj(latent) - if self.flatten: - latent = latent.flatten(2).transpose([0, 2, 1]) # BCHW -> BNC - if self.layer_norm: - latent = self.norm(latent) - if self.add_pos_embed: - return latent + self.pos_embed - else: - return latent - - -class TimestepEmbedding(nn.Layer): - def __init__( - self, - in_channels: int, - time_embed_dim: int, - act_fn: str = "silu", - out_dim: int = None, - post_act_fn: Optional[str] = None, - cond_proj_dim=None, - ): - super().__init__() - - self.linear_1 = nn.Linear(in_channels, time_embed_dim) - - if cond_proj_dim is not None: - self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias_attr=False) - else: - self.cond_proj = None - - if act_fn == "silu": - self.act = nn.Silu() - elif act_fn == "mish": - self.act = nn.Mish() - elif act_fn == "gelu": - self.act = nn.GELU() - else: - raise ValueError(f"{act_fn} does not exist. Make sure to define one of 'silu', 'mish', or 'gelu'") - - if out_dim is not None: - time_embed_dim_out = out_dim - else: - time_embed_dim_out = time_embed_dim - self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out) - - if post_act_fn is None: - self.post_act = None - elif post_act_fn == "silu": - self.post_act = nn.Silu() - elif post_act_fn == "mish": - self.post_act = nn.Mish() - elif post_act_fn == "gelu": - self.post_act = nn.GELU() - else: - raise ValueError(f"{post_act_fn} does not exist. 
Make sure to define one of 'silu', 'mish', or 'gelu'") - - def forward(self, sample, condition=None): - if condition is not None: - sample = sample + self.cond_proj(condition.cast(sample.dtype)) - sample = self.linear_1(sample) - - if self.act is not None: - sample = self.act(sample) - - sample = self.linear_2(sample) - - if self.post_act is not None: - sample = self.post_act(sample) - return sample - - -class Timesteps(nn.Layer): - def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float): - super().__init__() - self.num_channels = num_channels - self.flip_sin_to_cos = flip_sin_to_cos - self.downscale_freq_shift = downscale_freq_shift - - def forward(self, timesteps): - t_emb = get_timestep_embedding( - timesteps, - self.num_channels, - flip_sin_to_cos=self.flip_sin_to_cos, - downscale_freq_shift=self.downscale_freq_shift, - ) - return t_emb - - -class GaussianFourierProjection(nn.Layer): - """Gaussian Fourier embeddings for noise levels.""" - - def __init__( - self, embedding_size: int = 256, scale: float = 1.0, set_W_to_weight=True, log=True, flip_sin_to_cos=False - ): - super().__init__() - self.register_buffer("weight", paddle.randn((embedding_size,)) * scale) - self.log = log - self.flip_sin_to_cos = flip_sin_to_cos - - if set_W_to_weight: - # to delete later - self.register_buffer("W", paddle.randn((embedding_size,)) * scale) - - self.weight = self.W - - def forward(self, x): - # TODO must cast x to float32 - x = x.cast(self.weight.dtype) - if self.log: - x = paddle.log(x) - - x_proj = x[:, None] * self.weight[None, :] * 2 * np.pi - - if self.flip_sin_to_cos: - out = paddle.concat([paddle.cos(x_proj), paddle.sin(x_proj)], axis=-1) - else: - out = paddle.concat([paddle.sin(x_proj), paddle.cos(x_proj)], axis=-1) - return out - - -class ImagePositionalEmbeddings(nn.Layer): - """ - Converts latent image classes into vector embeddings. Sums the vector embeddings with positional embeddings for the - height and width of the latent space. - - For more details, see figure 10 of the dall-e paper: https://arxiv.org/abs/2102.12092 - - For VQ-diffusion: - - Output vector embeddings are used as input for the transformer. - - Note that the vector embeddings for the transformer are different than the vector embeddings from the VQVAE. - - Args: - num_embed (`int`): - Number of embeddings for the latent pixels embeddings. - height (`int`): - Height of the latent image i.e. the number of height embeddings. - width (`int`): - Width of the latent image i.e. the number of width embeddings. - embed_dim (`int`): - Dimension of the produced vector embeddings. Used for the latent pixel, height, and width embeddings. 
- """ - - def __init__( - self, - num_embed: int, - height: int, - width: int, - embed_dim: int, - ): - super().__init__() - - self.height = height - self.width = width - self.num_embed = num_embed - self.embed_dim = embed_dim - - self.emb = nn.Embedding(self.num_embed, embed_dim) - self.height_emb = nn.Embedding(self.height, embed_dim) - self.width_emb = nn.Embedding(self.width, embed_dim) - - def forward(self, index): - emb = self.emb(index) - - height_emb = self.height_emb(paddle.arange(self.height).reshape([1, self.height])) - - # 1 x H x D -> 1 x H x 1 x D - height_emb = height_emb.unsqueeze(2) - - width_emb = self.width_emb(paddle.arange(self.width).reshape([1, self.width])) - - # 1 x W x D -> 1 x 1 x W x D - width_emb = width_emb.unsqueeze(1) - - pos_emb = height_emb + width_emb - - # 1 x H x W x D -> 1 x L xD - pos_emb = pos_emb.reshape([1, self.height * self.width, -1]) - - emb = emb + pos_emb[:, : emb.shape[1], :] - - return emb - - -class LabelEmbedding(nn.Layer): - """ - Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance. - - Args: - num_classes (`int`): The number of classes. - hidden_size (`int`): The size of the vector embeddings. - dropout_prob (`float`): The probability of dropping a label. - """ - - def __init__(self, num_classes, hidden_size, dropout_prob): - super().__init__() - use_cfg_embedding = dropout_prob > 0 - self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size) - self.num_classes = num_classes - self.dropout_prob = dropout_prob - - def token_drop(self, labels, force_drop_ids=None): - """ - Drops labels to enable classifier-free guidance. - """ - if force_drop_ids is None: - drop_ids = ( - paddle.rand( - (labels.shape[0],), - ) - < self.dropout_prob - ) - else: - drop_ids = paddle.to_tensor(force_drop_ids == 1) - labels = paddle.where(drop_ids, self.num_classes, labels) - return labels - - def forward(self, labels, force_drop_ids=None): - use_dropout = self.dropout_prob > 0 - if (self.training and use_dropout) or (force_drop_ids is not None): - labels = self.token_drop(labels, force_drop_ids) - embeddings = self.embedding_table(labels) - return embeddings - - -class CombinedTimestepLabelEmbeddings(nn.Layer): - def __init__(self, num_classes, embedding_dim, class_dropout_prob=0.1): - super().__init__() - - self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1) - self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim) - self.class_embedder = LabelEmbedding(num_classes, embedding_dim, class_dropout_prob) - - def forward(self, timestep, class_labels, hidden_dtype=None): - timesteps_proj = self.time_proj(timestep) - timesteps_emb = self.timestep_embedder(timesteps_proj.cast(hidden_dtype)) # (N, D) - - class_labels = self.class_embedder(class_labels) # (N, D) - - conditioning = timesteps_emb + class_labels # (N, D) - - return conditioning - - -class TextTimeEmbedding(nn.Layer): - def __init__(self, encoder_dim: int, time_embed_dim: int, num_heads: int = 64): - super().__init__() - self.norm1 = nn.LayerNorm(encoder_dim) - self.pool = AttentionPooling(num_heads, encoder_dim) - self.proj = nn.Linear(encoder_dim, time_embed_dim) - self.norm2 = nn.LayerNorm(time_embed_dim) - - def forward(self, hidden_states): - hidden_states = self.norm1(hidden_states) - hidden_states = self.pool(hidden_states) - hidden_states = self.proj(hidden_states) - hidden_states = self.norm2(hidden_states) - return hidden_states - - -class 
AttentionPooling(nn.Layer): - # Copied from https://github.com/deep-floyd/IF/blob/2f91391f27dd3c468bf174be5805b4cc92980c0b/deepfloyd_if/model/nn.py#L54 - - def __init__(self, num_heads, embed_dim, dtype=None): - super().__init__() - self.positional_embedding = self.create_parameter( - (1, embed_dim), default_initializer=nn.initializer.Assign(paddle.randn((1, embed_dim)) / embed_dim**0.5) - ) - self.k_proj = nn.Linear(embed_dim, embed_dim) - self.q_proj = nn.Linear(embed_dim, embed_dim) - self.v_proj = nn.Linear(embed_dim, embed_dim) - self.num_heads = num_heads - self.dim_per_head = embed_dim // self.num_heads - self.scale = 1 / math.sqrt(math.sqrt(self.dim_per_head)) - - def forward(self, x): - bs, length, width = x.shape - - def shape(x): - # (bs, length, width) --> (bs, length, n_heads, dim_per_head) - x = x.reshape([bs, -1, self.num_heads, self.dim_per_head]) - # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head) - x = x.transpose([0, 2, 1, 3]) - # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head) - x = x.reshape([bs * self.num_heads, -1, self.dim_per_head]) - # (bs*n_heads, length, dim_per_head) --> (bs*n_heads, dim_per_head, length) - x = x.transpose([0, 2, 1]) - return x - - class_token = x.mean(axis=1, keepdim=True) + self.positional_embedding.cast(x.dtype) - x = paddle.concat([class_token, x], axis=1) # (bs, length+1, width) - - # (bs*n_heads, class_token_length, dim_per_head) - q = shape(self.q_proj(class_token)) - # (bs*n_heads, length+class_token_length, dim_per_head) - k = shape(self.k_proj(x)) - v = shape(self.v_proj(x)) - - # (bs*n_heads, class_token_length, length+class_token_length): - weight = paddle.einsum( - "bct,bcs->bts", q * self.scale, k * self.scale - ) # More stable with f16 than dividing afterwards - weight = nn.functional.softmax(weight.cast("float32"), axis=-1).cast(weight.dtype) - - # (bs*n_heads, dim_per_head, class_token_length) - a = paddle.einsum("bts,bcs->bct", weight, v) - - # (bs, length+1, width) - a = a.reshape([bs, -1, 1]).transpose([0, 2, 1]) - - return a[:, 0, :] # cls_token diff --git a/ppdiffusers/ppdiffusers/models/modeling_pytorch_paddle_utils.py b/ppdiffusers/ppdiffusers/models/modeling_pytorch_paddle_utils.py deleted file mode 100644 index 213b2efdd2ca..000000000000 --- a/ppdiffusers/ppdiffusers/models/modeling_pytorch_paddle_utils.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
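The helpers above build the sinusoidal timestep features and the small MLP that projects them to the time embedding consumed as `temb` by the UNet and ControlNet blocks (e.g. `Timesteps(320, ...)` followed by `TimestepEmbedding(320, 1280)` in the ControlNet earlier in this patch). A small sketch, assuming the pre-removal module path; the dimensions are illustrative.

import paddle
from ppdiffusers.models.embeddings import (  # module path before this patch removed it
    TimestepEmbedding,
    Timesteps,
    get_timestep_embedding,
)

timesteps = paddle.to_tensor([0, 250, 999], dtype="int64")

# raw sinusoidal features, shape (3, 320)
sinusoid = get_timestep_embedding(timesteps, embedding_dim=320, flip_sin_to_cos=True, downscale_freq_shift=0)

# the same thing via the layers the UNet/ControlNet actually use
time_proj = Timesteps(num_channels=320, flip_sin_to_cos=True, downscale_freq_shift=0)
time_embedding = TimestepEmbedding(in_channels=320, time_embed_dim=1280, act_fn="silu")
emb = time_embedding(time_proj(timesteps))  # shape (3, 1280), passed as `temb` to the ResNet blocks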
-""" PyTorch - Paddle general utilities.""" -import paddle.nn as nn - -##################### -# PyTorch => Paddle # -##################### - - -def convert_pytorch_state_dict_to_paddle(pt_state_dict, paddle_model: nn.Layer, sub_layer=None): - # Step 1: Find Linear layer which need transpose weight - linear_need_transpose = [] - for k, v in paddle_model.named_sublayers(include_self=True): - if isinstance(v, nn.Linear): - if sub_layer is not None and sub_layer not in k: - continue - linear_need_transpose.append(k + ".weight") - - paddle_state_dict = {} - ignore_keys = ["position_ids", ".num_batches_tracked"] - ptname2pdname = { - # torch.nn.BatchNorm2d -> paddle.nn.BatchNorm2D - ".running_var": "._variance", - ".running_mean": "._mean", - } - # Need to change some parameters name to match paddle names - for pt_key, pt_tensor in pt_state_dict.items(): - # only convert sub_layer state dict - if sub_layer is not None and sub_layer not in pt_key: - continue - # (0) ignore_keys - if any(i in pt_key for i in ignore_keys): - continue - # (1) transpose linear - if pt_key in linear_need_transpose and pt_tensor.ndim == 2: - pt_tensor = pt_tensor.T - # (2) 0d tensor -> 1d tensor - if pt_tensor.ndim == 0: - pt_tensor = pt_tensor.reshape((1,)) - # (3) name mapping - for old_key, new_key in ptname2pdname.items(): - pt_key = pt_key.replace(old_key, new_key) - - paddle_state_dict[pt_key] = pt_tensor - return paddle_state_dict - - -@classmethod -def convert_pytorch_state_dict_to_paddle_class_method(cls, pt_state_dict, paddle_model: nn.Layer, sub_layer=None): - # Step 1: Find Linear layer which need transpose weight - linear_need_transpose = [] - for k, v in paddle_model.named_sublayers(include_self=True): - if isinstance(v, nn.Linear): - if sub_layer is not None and sub_layer not in k: - continue - linear_need_transpose.append(k + ".weight") - - paddle_state_dict = {} - ignore_keys = ["position_ids", ".num_batches_tracked"] - ptname2pdname = { - # torch.nn.BatchNorm2d -> paddle.nn.BatchNorm2D - ".running_var": "._variance", - ".running_mean": "._mean", - } - if not hasattr(cls, "paddle_torch_name_mapping"): - cls.paddle_torch_name_mapping = {} - # Need to change some parameters name to match paddle names - for pt_key, pt_tensor in pt_state_dict.items(): - torch_name = pt_key - # only convert sub_layer state dict - if sub_layer is not None and sub_layer not in pt_key: - continue - # (0) ignore_keys - if any(i in pt_key for i in ignore_keys): - continue - # (1) transpose linear - if pt_key in linear_need_transpose and pt_tensor.ndim == 2: - pt_tensor = pt_tensor.T - # (2) 0d tensor -> 1d tensor - if pt_tensor.ndim == 0: - pt_tensor = pt_tensor.reshape((1,)) - # (3) name mapping - for old_key, new_key in ptname2pdname.items(): - pt_key = pt_key.replace(old_key, new_key) - - cls.paddle_torch_name_mapping[pt_key] = torch_name - paddle_state_dict[pt_key] = pt_tensor - return paddle_state_dict - - -def convert_paddle_state_dict_to_pytorch(pd_state_dict, paddle_model: nn.Layer): - # Step 2: Find Linear layer which need transpose weight - linear_need_transpose = [] - for k, v in paddle_model.named_sublayers(include_self=True): - if isinstance(v, nn.Linear): - linear_need_transpose.append(k + ".weight") - - pytorch_state_dict = {} - ignore_keys = ["position_ids"] - ptname2pdname = { - # torch.nn.BatchNorm2d -> paddle.nn.BatchNorm2D - ".running_var": "._variance", - ".running_mean": "._mean", - } - # Need to change some parameters name to match Flax names - for pd_key, pd_tensor in pd_state_dict.items(): - # (0) 
ignore_keys - if any(i in pd_key for i in ignore_keys): - continue - # (1) transpose linear - if pd_key in linear_need_transpose and pd_tensor.ndim == 2: - pd_tensor = pd_tensor.T - # TODO maybe not true - # (2) 1d tensor -> 0d tensor - if pd_tensor.ndim == 1: - pd_tensor = pd_tensor.squeeze() - # (3) name mapping - for old_key, new_key in ptname2pdname.items(): - pd_key = pd_key.replace(new_key, old_key) - if hasattr(paddle_model, "paddle_torch_name_mapping"): - pd_key = paddle_model.paddle_torch_name_mapping.get(pd_key, pd_key) - pytorch_state_dict[pd_key] = pd_tensor.contiguous() if hasattr(pd_tensor, "contiguous") else pd_tensor - return pytorch_state_dict - - -# if __name__ == "__main__": -# from paddlenlp.transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPVisionModel, CLIPVisionModelWithProjection, BertModel, DPTForDepthEstimation, BitBackbone -# from ppdiffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker -# from ppdiffusers.pipelines.stable_diffusion_safe.safety_checker import SafeStableDiffusionSafetyChecker -# from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation -# from ppdiffusers.pipelines.paint_by_example.image_encoder import PaintByExampleImageEncoder - -# clip = [(CLIPTextModel, "runwayml/stable-diffusion-v1-5", "text_encoder"), # test safetensors -# (CLIPTextModel, "CompVis/stable-diffusion-v1-4", "text_encoder"), -# (CLIPTextModelWithProjection, "shi-labs/versatile-diffusion", "text_encoder"), -# (StableDiffusionSafetyChecker,"CompVis/stable-diffusion-v1-4", "safety_checker"), -# (SafeStableDiffusionSafetyChecker,"CompVis/stable-diffusion-v1-4", "safety_checker"), -# (CLIPVisionModelWithProjection, "shi-labs/versatile-diffusion", "image_encoder"), -# (PaintByExampleImageEncoder, "Fantasy-Studio/Paint-by-Example", "image_encoder"), -# ] -# bert = [(BertModel, "IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1", "text_encoder"), -# (RobertaSeriesModelWithTransformation, "BAAI/AltDiffusion", "text_encoder")] -# other = [(DPTForDepthEstimation, "stabilityai/stable-diffusion-2-depth", "depth_estimator")] # test safetensors -# for cls_, name, subfolder in clip+bert+other: -# print(name + "======" + subfolder) -# model, load_info = cls_.from_pretrained( -# name, -# output_loading_info=True, -# subfolder=subfolder, -# from_hf_hub=True, -# from_diffusers=True, -# resume_download=True, -# cache_dir="nihao", -# ) diff --git a/ppdiffusers/ppdiffusers/models/modeling_utils.py b/ppdiffusers/ppdiffusers/models/modeling_utils.py deleted file mode 100644 index 7daad96a8195..000000000000 --- a/ppdiffusers/ppdiffusers/models/modeling_utils.py +++ /dev/null @@ -1,791 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
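Taken together, the conversion helpers above encode three conventions when weights cross the framework boundary: `nn.Linear` weights are transposed (PyTorch stores them as `[out_features, in_features]`, Paddle as `[in_features, out_features]`), BatchNorm running statistics are renamed (`.running_mean`/`.running_var` become `._mean`/`._variance`), and 0-d tensors are expanded to 1-d, with the reverse squeeze on the way back. A hand-rolled sketch of the same rules on a toy state dict, using plain NumPy arrays and made-up key names:

```py
import numpy as np

# A toy "PyTorch-style" state dict; keys and shapes are invented for illustration.
pt_state_dict = {
    "proj.weight": np.zeros((4, 8), dtype=np.float32),      # nn.Linear weight, [out, in]
    "norm.running_mean": np.zeros((8,), dtype=np.float32),  # BatchNorm statistic to rename
    "logit_scale": np.float32(1.0),                          # 0-d tensor
}

pd_state_dict = {}
for key, tensor in pt_state_dict.items():
    tensor = np.asarray(tensor)
    if key == "proj.weight" and tensor.ndim == 2:
        tensor = tensor.T                              # (1) transpose Linear weights
    if tensor.ndim == 0:
        tensor = tensor.reshape((1,))                  # (2) 0-d -> 1-d
    key = key.replace(".running_mean", "._mean")       # (3) BatchNorm name mapping
    key = key.replace(".running_var", "._variance")
    pd_state_dict[key] = tensor

print({k: v.shape for k, v in pd_state_dict.items()})
# {'proj.weight': (8, 4), 'norm._mean': (8,), 'logit_scale': (1,)}
```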
- -import os -from functools import partial -from typing import Any, Callable, Optional, Union - -import paddle -import paddle.nn as nn - -from ..utils import ( - CONFIG_NAME, - DIFFUSERS_CACHE, - FROM_DIFFUSERS, - FROM_HF_HUB, - HF_HUB_OFFLINE, - LOW_CPU_MEM_USAGE_DEFAULT, - PADDLE_WEIGHTS_NAME, - PPDIFFUSERS_CACHE, - TO_DIFFUSERS, - TORCH_SAFETENSORS_WEIGHTS_NAME, - TORCH_WEIGHTS_NAME, - _add_variant, - _get_model_file, - deprecate, - is_paddlenlp_available, - is_safetensors_available, - is_torch_available, - is_torch_file, - logging, - smart_load, -) -from ..version import VERSION as __version__ -from .modeling_pytorch_paddle_utils import ( - convert_paddle_state_dict_to_pytorch, - convert_pytorch_state_dict_to_paddle, -) - -logger = logging.get_logger(__name__) - - -if is_safetensors_available(): - from safetensors.numpy import save_file as safetensors_numpy_save_file - - if is_torch_available(): - from safetensors.torch import save_file as safetensors_torch_save_file - -if is_torch_available(): - import torch - -if is_paddlenlp_available: - try: - from paddlenlp.transformers.model_utils import no_init_weights - except ImportError: - from ..utils.paddle_utils import no_init_weights - - -def get_parameter_device(parameter: nn.Layer): - try: - # TODO https://github.com/huggingface/diffusers/compare/v0.15.0...v0.16.0#diff-6a3b9a08c1d37dbc341131632415fea800af242a84fb31f1bcd40d725e2eeeebR64 - return next(parameter.named_parameters())[1].place - except StopIteration: - try: - return next(parameter.named_buffers())[1].place - except StopIteration: - return paddle.get_device() - - -def get_parameter_dtype(parameter: nn.Layer) -> paddle.dtype: - try: - # TODO https://github.com/huggingface/diffusers/compare/v0.15.0...v0.16.0#diff-6a3b9a08c1d37dbc341131632415fea800af242a84fb31f1bcd40d725e2eeeebR80 - return next(parameter.named_parameters())[1].dtype - except StopIteration: - try: - return next(parameter.named_buffers())[1].dtype - except StopIteration: - return parameter._dtype - - -def convert_state_dict(state_dict, framework="torch"): - if framework in ["torch", "pt"]: - # support bfloat16 - newstate_dict = {} - for k, v in state_dict.items(): - if v.dtype == paddle.bfloat16: - v = v.cast("float32").cpu().numpy() - newstate_dict[k] = torch.tensor(v).to(torch.bfloat16) - else: - newstate_dict[k] = torch.tensor(v.cpu().numpy()) - return newstate_dict - elif framework in ["numpy", "np"]: - state_dict = {k: v.cpu().numpy() for k, v in state_dict.items()} - return state_dict - elif framework in ["paddle", "pd"]: - state_dict = {k: paddle.to_tensor(v, place="cpu") for k, v in state_dict.items()} - return state_dict - else: - raise NotImplementedError(f"Not Implemented {framework} framework!") - - -from contextlib import ExitStack - - -class ContextManagers: - """ - Wrapper for `contextlib.ExitStack` which enters a collection of context managers. Adaptation of `ContextManagers` - in the `fastcore` library. - """ - - def __init__(self, context_managers): - self.context_managers = context_managers - self.stack = ExitStack() - - def __enter__(self): - for context_manager in self.context_managers: - self.stack.enter_context(context_manager) - - def __exit__(self, *args, **kwargs): - self.stack.__exit__(*args, **kwargs) - - -class ModelMixin(nn.Layer): - r""" - Base class for all models. - - [`ModelMixin`] takes care of storing the configuration of the models and handles methods for loading, downloading - and saving models. 
- - - **config_name** ([`str`]) -- A filename under which the model should be stored when calling - [`~models.ModelMixin.save_pretrained`]. - """ - config_name = CONFIG_NAME - _automatically_saved_args = ["_ppdiffusers_version", "_class_name", "_name_or_path"] - _supports_gradient_checkpointing = False - - def __init__(self): - super().__init__() - - def __getattr__(self, name: str) -> Any: - """The only reason we overwrite `getattr` here is to gracefully deprecate accessing - config attributes directly. See https://github.com/huggingface/diffusers/pull/3129 We need to overwrite - __getattr__ here in addition so that we don't trigger `nn.Layer`'s __getattr__': - https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module - """ - - is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name) - is_attribute = name in self.__dict__ - - if is_in_config and not is_attribute: - deprecation_message = f"Accessing config attribute `{name}` directly via '{type(self).__name__}' object attribute is deprecated. Please access '{name}' over '{type(self).__name__}'s config object instead, e.g. 'unet.config.{name}'." - deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False, stacklevel=3) - return self._internal_dict[name] - - # call PyTorch's https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module - return super().__getattr__(name) - - @property - def is_gradient_checkpointing(self) -> bool: - """ - Whether gradient checkpointing is activated for this model or not. - - Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint - activations". - """ - return any( - hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing - for m in self.sublayers(include_self=True) - ) - - def enable_gradient_checkpointing(self): - """ - Activates gradient checkpointing for the current model. - - Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint - activations". - """ - if not self._supports_gradient_checkpointing: - raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") - self.apply(partial(self._set_gradient_checkpointing, value=True)) - - def disable_gradient_checkpointing(self): - """ - Deactivates gradient checkpointing for the current model. - - Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint - activations". - """ - if self._supports_gradient_checkpointing: - self.apply(partial(self._set_gradient_checkpointing, value=False)) - - def set_use_memory_efficient_attention_xformers(self, valid: bool, attention_op: Optional[str] = None) -> None: - # Recursively walk through all the children. - # Any children which exposes the set_use_memory_efficient_attention_xformers method - # gets the message - def fn_recursive_set_mem_eff(module: nn.Layer): - if hasattr(module, "set_use_memory_efficient_attention_xformers"): - module.set_use_memory_efficient_attention_xformers(valid, attention_op) - - for child in module.children(): - fn_recursive_set_mem_eff(child) - - for module in self.children(): - if isinstance(module, nn.Layer): - fn_recursive_set_mem_eff(module) - - def enable_xformers_memory_efficient_attention(self, attention_op: Optional[str] = None): - r""" - Enable memory efficient attention as implemented in xformers. 
- - When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference - time. Speed up at training time is not guaranteed. - - Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention - is used. - - Parameters: - attention_op (`Callable`, *optional*): - Override the default `None` - - Examples: - - ```py - >>> import paddle - >>> from ppdiffusers import UNet2DConditionModel - - >>> model = UNet2DConditionModel.from_pretrained( - ... "stabilityai/stable-diffusion-2-1", subfolder="unet", paddle_dtype=paddle.float16 - ... ) - >>> model.enable_xformers_memory_efficient_attention() - ``` - """ - self.set_use_memory_efficient_attention_xformers(True, attention_op) - - def disable_xformers_memory_efficient_attention(self): - r""" - Disable memory efficient attention as implemented in xformers. - """ - self.set_use_memory_efficient_attention_xformers(False) - - def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - is_main_process: bool = True, - save_function: Callable = None, - safe_serialization: bool = False, - variant: Optional[str] = None, - to_diffusers: Optional[bool] = None, - ): - """ - Save a model and its configuration file to a directory, so that it can be re-loaded using the - `[`~models.ModelMixin.from_pretrained`]` class method. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to which to save. Will be created if it doesn't exist. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful when in distributed training like - TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on - the main process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful on distributed training like TPUs when one - need to replace `paddle.save` by another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - variant (`str`, *optional*): - If specified, weights are saved in the format pytorch_model..bin. - to_diffusers (`bool`, *optional*, defaults to `False`): - If specified, weights are saved in the format of torch. eg. linear need transpose. - safe_serialization (`bool`, *optional*, defaults to `False`): - Only when `to_diffusers` is True, Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). 
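Example (an illustrative sketch; the local directory names are placeholders):

```py
>>> from ppdiffusers import UNet2DConditionModel

>>> unet = UNet2DConditionModel.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="unet")
>>> # Default: Paddle weights (model_state.pdparams).
>>> unet.save_pretrained("./sd21-unet-paddle")
>>> # Torch-compatible export, serialized with safetensors.
>>> unet.save_pretrained("./sd21-unet-torch", to_diffusers=True, safe_serialization=True)
```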
- """ - if to_diffusers is None: - to_diffusers = TO_DIFFUSERS - if to_diffusers and safe_serialization and not is_safetensors_available(): - raise ImportError("`safe_serialization` requires the `safetensors library: `pip install safetensors`.") - - if os.path.isfile(save_directory): - logger.error(f"Provided path ({save_directory}) should be a directory, not a file") - return - - os.makedirs(save_directory, exist_ok=True) - - model_to_save = self - - # Attach architecture to the config - # Save the config - if is_main_process: - model_to_save.save_config(save_directory, to_diffusers=to_diffusers) - - # Save the model - state_dict = model_to_save.state_dict() - - # save ignore lora_weights - fn = lambda k: ".lora_" in k or ".alpha" in k - state_dict = {k: v for k, v in state_dict.items() if not fn(k)} - - # choose save_function - if save_function is None: - if to_diffusers: - if safe_serialization: - if is_torch_available(): - save_function = safetensors_torch_save_file - state_dict = convert_state_dict(state_dict, framework="torch") - else: - save_function = safetensors_numpy_save_file - state_dict = convert_state_dict(state_dict, framework="numpy") - weights_name = _add_variant(TORCH_SAFETENSORS_WEIGHTS_NAME, variant) - else: - if not is_torch_available(): - raise ImportError( - "`to_diffusers=True` with `safe_serialization=False` requires the `torch library: `pip install torch`." - ) - save_function = torch.save - weights_name = _add_variant(TORCH_WEIGHTS_NAME, variant) - state_dict = convert_state_dict(state_dict, framework="torch") - - state_dict = convert_paddle_state_dict_to_pytorch(state_dict, model_to_save) - else: - save_function = paddle.save - weights_name = _add_variant(PADDLE_WEIGHTS_NAME, variant) - - # Save the model - save_function(state_dict, os.path.join(save_directory, weights_name)) - - logger.info(f"Model weights saved in {os.path.join(save_directory, weights_name)}") - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): - r""" - Instantiate a pretrained pytorch model from a pre-trained model configuration. - - The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train - the model, you should first set it back in training mode with `model.train()`. - - The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come - pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning - task. - - The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those - weights are discarded. - - Parameters: - pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): - Can be either: - - - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. - Valid model ids should have an organization name, like `google/ddpm-celebahq-256`. - - A path to a *directory* containing model weights saved using [`~ModelMixin.save_config`], e.g., - `./my_model_directory/`. - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. - paddle_dtype (`str` or `paddle.dtype`, *optional*): - Override the default `paddle.dtype` and load the model under this dtype. If `"auto"` is passed the dtype - will be automatically derived from the model's weights. 
- force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - output_loading_info(`bool`, *optional*, defaults to `False`): - Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. - local_files_only(`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). - use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `diffusers-cli login` (stored in `~/.huggingface`). - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. - from_diffusers (`bool`, *optional*, defaults to `False`): - Load the model weights from a torch checkpoint save file. - subfolder (`str`, *optional*, defaults to `""`): - In case the relevant files are located inside a subfolder of the model repo (either remote in - huggingface.co or downloaded locally), you can specify the folder name here. - - mirror (`str`, *optional*): - Mirror source to accelerate downloads in China. If you are from China and have an accessibility - problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. - Please refer to the mirror site for more information. - variant (`str`, *optional*): - If specified load weights from `variant` filename, *e.g.* pytorch_model..bin. - model_state..pdparams. - use_safetensors (`bool`, *optional* ): - If set to `True`, the pipeline will forcibly load the models from `safetensors` weights. If set to - `None` (the default). The pipeline will load using `safetensors` if safetensors weights are available - *and* if `safetensors` is installed. If the to `False` the pipeline will *not* use `safetensors`. - - - - It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated - models](https://huggingface.co/docs/hub/models-gated#gated-models). 
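Example (an illustrative sketch; the repo id and dtype are only for demonstration):

```py
>>> import paddle
>>> from ppdiffusers import UNet2DConditionModel

>>> # Load converted Paddle weights.
>>> unet = UNet2DConditionModel.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="unet")

>>> # Or pull the original torch weights from the HuggingFace Hub and convert on the fly.
>>> unet = UNet2DConditionModel.from_pretrained(
...     "stabilityai/stable-diffusion-2-1",
...     subfolder="unet",
...     from_hf_hub=True,
...     from_diffusers=True,
...     paddle_dtype=paddle.float16,
... )
```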
- - - - """ - from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = ( - kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) - ) - ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) - force_download = kwargs.pop("force_download", False) - from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - output_loading_info = kwargs.pop("output_loading_info", False) - local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) - use_auth_token = kwargs.pop("use_auth_token", None) - revision = kwargs.pop("revision", None) - paddle_dtype = kwargs.pop("paddle_dtype", None) - subfolder = kwargs.pop("subfolder", None) - ignore_keys = kwargs.pop("ignore_keys", None) - low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", LOW_CPU_MEM_USAGE_DEFAULT) - variant = kwargs.pop("variant", None) - use_safetensors = kwargs.pop("use_safetensors", None) - - if from_diffusers and use_safetensors and not is_safetensors_available(): - raise ValueError( - "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors" - ) - if use_safetensors is None: - use_safetensors = is_safetensors_available() - - # Load config if we don't provide a configuration - config_path = pretrained_model_name_or_path - - user_agent = { - "ppdiffusers": __version__, - "file_type": "model", - "framework": "pytorch" if from_diffusers else "paddle", - } - - # load config - config, unused_kwargs, commit_hash = cls.load_config( - config_path, - cache_dir=cache_dir, - return_unused_kwargs=True, - return_commit_hash=True, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - from_hf_hub=from_hf_hub, # whether or not from_hf_hub - **kwargs, - ) - - # This variable will flag if we're loading a sharded checkpoint. 
In this case the archive file is just the - # Load model - model_file = None - if from_diffusers: - if use_safetensors: - try: - model_file = _get_model_file( - pretrained_model_name_or_path, - weights_name=_add_variant(TORCH_SAFETENSORS_WEIGHTS_NAME, variant), - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - commit_hash=commit_hash, - from_hf_hub=from_hf_hub, - ) - # try load model_file with paddle / torch / safetensor - state_dict = smart_load(model_file) - except Exception: - model_file = None - pass - if model_file is None: - model_file = _get_model_file( - pretrained_model_name_or_path, - weights_name=_add_variant(TORCH_WEIGHTS_NAME, variant), - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - commit_hash=commit_hash, - from_hf_hub=from_hf_hub, - ) - # try load model_file with paddle / torch / safetensor - state_dict = smart_load(model_file) - else: - model_file = _get_model_file( - pretrained_model_name_or_path, - weights_name=_add_variant(PADDLE_WEIGHTS_NAME, variant), - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - commit_hash=commit_hash, - from_hf_hub=from_hf_hub, - ) - # try load model_file with paddle / torch / safetensor - state_dict = smart_load(model_file) - - init_contexts = [] - - dtype = set(v.dtype for v in state_dict.values() if paddle.is_tensor(v) and paddle.is_floating_point(v)) - if len(dtype) > 1 and paddle.float32 not in dtype: - raise ValueError( - f"The weights of the model file {model_file} have a mixture of incompatible dtypes {dtype}. Please" - f" make sure that {model_file} weights have only one dtype." - ) - elif len(dtype) > 1 and paddle.float32 in dtype: - dtype = paddle.float32 - elif len(dtype) == 0: - dtype = paddle.float32 - else: - dtype = dtype.pop() - - # for - if "uint8" in str(dtype): - state_dict = {k: v.astype("float32") for k, v in state_dict.items()} - dtype = paddle.float32 - - init_contexts.append(paddle.dtype_guard(dtype)) - - if low_cpu_mem_usage: - # Instantiate model. 
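# In sketch form, the instantiation below boils down to:
#
#     with ContextManagers([paddle.dtype_guard(dtype), no_init_weights(_enable=True)]):
#         model = cls.from_config(config, **unused_kwargs)
#
# i.e. the model skeleton is created directly in the checkpoint's dtype and, when
# `low_cpu_mem_usage` is enabled, without running the usual weight initializers;
# the real parameters are filled in from `state_dict` further down.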
- init_contexts.append(no_init_weights(_enable=True)) - if hasattr(paddle, "LazyGuard"): - init_contexts.append(paddle.LazyGuard()) - - with ContextManagers(init_contexts): - model = cls.from_config(config, **unused_kwargs) - - # convert weights - if from_diffusers or is_torch_file(model_file): - state_dict = convert_pytorch_state_dict_to_paddle(state_dict, model) - - # remove keys - if ignore_keys is not None: - keys = list(state_dict.keys()) - for k in keys: - for ik in ignore_keys: - if k.startswith(ik): - logger.warning("Deleting key {} from state_dict.".format(k)) - del state_dict[k] - - model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model( - model, - state_dict, - model_file, - pretrained_model_name_or_path, - ignore_mismatched_sizes=ignore_mismatched_sizes, - ) - - loading_info = { - "missing_keys": missing_keys, - "unexpected_keys": unexpected_keys, - "mismatched_keys": mismatched_keys, - "error_msgs": error_msgs, - } - - # if paddle_dtype is not None and not isinstance(paddle_dtype, paddle.dtype): - # raise ValueError( - # f"{paddle_dtype} needs to be of type `paddle.dtype`, e.g. `paddle.float16`, but is {type(paddle_dtype)}." - # ) - if paddle_dtype is not None: - model = model.to(dtype=paddle_dtype) - - model.register_to_config(_name_or_path=pretrained_model_name_or_path) - - # Set model in evaluation mode to deactivate DropOut modules by default - model.eval() - if output_loading_info: - return model, loading_info - - return model - - @classmethod - def _load_pretrained_model( - cls, - model, - state_dict, - resolved_archive_file, - pretrained_model_name_or_path, - ignore_mismatched_sizes=False, - ): - # Retrieve missing & unexpected_keys - model_state_dict = model.state_dict() - loaded_keys = list(state_dict.keys()) - - expected_keys = list(model_state_dict.keys()) - - original_loaded_keys = loaded_keys - - missing_keys = list(set(expected_keys) - set(loaded_keys)) - unexpected_keys = list(set(loaded_keys) - set(expected_keys)) - - # Make sure we are able to load base models as well as derived models (with heads) - model_to_load = model - - def _find_mismatched_keys( - state_dict, - model_state_dict, - loaded_keys, - ignore_mismatched_sizes, - ): - mismatched_keys = [] - for checkpoint_key in loaded_keys: - model_key = checkpoint_key - - if model_key in model_state_dict and list(state_dict[checkpoint_key].shape) != list( - model_state_dict[model_key].shape - ): - mismatched_keys.append( - (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape) - ) - del state_dict[checkpoint_key] - if ignore_mismatched_sizes: - mismatched_keys = [] - return mismatched_keys - - if state_dict is not None: - # Whole checkpoint - mismatched_keys = _find_mismatched_keys( - state_dict, - model_state_dict, - original_loaded_keys, - ignore_mismatched_sizes, - ) - error_msgs = [] - for key_name, loaded_shape, model_shape in mismatched_keys: - error_msgs.append( - f"Error size mismatch, {key_name} receives a shape {loaded_shape}, but the expected shape is {model_shape}." - ) - model_to_load.load_dict(state_dict) - - if len(error_msgs) > 0: - error_msg = "\n\t".join(error_msgs) - if "size mismatch" in error_msg: - error_msg += ( - "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method." 
- ) - raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}") - - if len(unexpected_keys) > 0: - logger.warning( - f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when" - f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are" - f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task" - " or with another architecture (e.g. initializing a BertForSequenceClassification model from a" - " BertForPreTraining model).\n- This IS NOT expected if you are initializing" - f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly" - " identical (initializing a BertForSequenceClassification model from a" - " BertForSequenceClassification model)." - ) - else: - logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") - if len(missing_keys) > 0: - logger.warning( - f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" - f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably" - " TRAIN this model on a down-stream task to be able to use it for predictions and inference." - ) - elif len(mismatched_keys) == 0: - logger.info( - f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at" - f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the" - f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions" - " without further training." - ) - if len(mismatched_keys) > 0: - mismatched_warning = "\n".join( - [ - f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" - for key, shape1, shape2 in mismatched_keys - ] - ) - logger.warning( - f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" - f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not" - f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be" - " able to use it for predictions and inference." - ) - - return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs - - @property - def device(self): - """ - `paddle.place`: The device on which the module is (assuming that all the module parameters are on the same - device). - """ - return get_parameter_device(self) - - @property - def dtype(self) -> paddle.dtype: - """ - `paddle.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype). - """ - return get_parameter_dtype(self) - - def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int: - """ - Get number of (optionally, trainable or non-embeddings) parameters in the module. - - Args: - only_trainable (`bool`, *optional*, defaults to `False`): - Whether or not to return only the number of trainable parameters - - exclude_embeddings (`bool`, *optional*, defaults to `False`): - Whether or not to return only the number of non-embeddings parameters - - Returns: - `int`: The number of parameters. 
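Example (an illustrative sketch; the model id is a placeholder):

```py
>>> from ppdiffusers import UNet2DConditionModel

>>> unet = UNet2DConditionModel.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="unet")
>>> unet.num_parameters()                     # every parameter
>>> unet.num_parameters(only_trainable=True)  # skips parameters with stop_gradient=True
```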
- """ - - if exclude_embeddings: - embedding_param_names = [ - f"{name}.weight" - for name, module_type in self.named_sublayers(include_self=True) - if isinstance(module_type, nn.Embedding) - ] - non_embedding_parameters = [ - parameter for name, parameter in self.named_parameters() if name not in embedding_param_names - ] - return sum(p.numel() for p in non_embedding_parameters if not p.stop_gradient or not only_trainable) - else: - return sum(p.numel() for p in self.parameters() if not p.stop_gradient or not only_trainable) - - -def unfreeze_params(params): - for param in params: - param.stop_gradient = False - - -def freeze_params(params): - for param in params: - param.stop_gradient = True - - -def unfreeze_model(model: nn.Layer): - for param in model.parameters(): - param.stop_gradient = False - - -def freeze_model(model: nn.Layer): - for param in model.parameters(): - param.stop_gradient = True - - -def unwrap_model(model: nn.Layer) -> nn.Layer: - """ - Recursively unwraps a model from potential containers (as used in distributed training). - - Args: - model (`nn.Layer`): The model to unwrap. - """ - # since there could be multiple levels of wrapping, unwrap recursively - if hasattr(model, "_layers"): - return unwrap_model(model._layers) - else: - return model diff --git a/ppdiffusers/ppdiffusers/models/prior_transformer.py b/ppdiffusers/ppdiffusers/models/prior_transformer.py deleted file mode 100644 index 21abe1f9b2b5..000000000000 --- a/ppdiffusers/ppdiffusers/models/prior_transformer.py +++ /dev/null @@ -1,219 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import Optional, Union - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import NEG_INF, BaseOutput -from .attention import BasicTransformerBlock -from .embeddings import TimestepEmbedding, Timesteps -from .modeling_utils import ModelMixin - - -@dataclass -class PriorTransformerOutput(BaseOutput): - """ - Args: - predicted_image_embedding (`paddle.Tensor` of shape `(batch_size, embedding_dim)`): - The predicted CLIP image embedding conditioned on the CLIP text embedding input. - """ - - predicted_image_embedding: paddle.Tensor - - -class PriorTransformer(ModelMixin, ConfigMixin): - """ - The prior transformer from unCLIP is used to predict CLIP image embeddings from CLIP text embeddings. Note that the - transformer predicts the image embeddings through a denoising diffusion process. - - This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the models (such as downloading or saving, etc.) 
- - For more details, see the original paper: https://arxiv.org/abs/2204.06125 - - Parameters: - num_attention_heads (`int`, *optional*, defaults to 32): The number of heads to use for multi-head attention. - attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head. - num_layers (`int`, *optional*, defaults to 20): The number of layers of Transformer blocks to use. - embedding_dim (`int`, *optional*, defaults to 768): The dimension of the CLIP embeddings. Note that CLIP - image embeddings and text embeddings are both the same dimension. - num_embeddings (`int`, *optional*, defaults to 77): The max number of clip embeddings allowed. I.e. the - length of the prompt after it has been tokenized. - additional_embeddings (`int`, *optional*, defaults to 4): The number of additional tokens appended to the - projected hidden_states. The actual length of the used hidden_states is `num_embeddings + - additional_embeddings`. - dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. - - """ - - @register_to_config - def __init__( - self, - num_attention_heads: int = 32, - attention_head_dim: int = 64, - num_layers: int = 20, - embedding_dim: int = 768, - num_embeddings=77, - additional_embeddings=4, - dropout: float = 0.0, - ): - super().__init__() - self.num_attention_heads = num_attention_heads - self.attention_head_dim = attention_head_dim - inner_dim = num_attention_heads * attention_head_dim - self.additional_embeddings = additional_embeddings - - self.time_proj = Timesteps(inner_dim, True, 0) - self.time_embedding = TimestepEmbedding(inner_dim, inner_dim) - - self.proj_in = nn.Linear(embedding_dim, inner_dim) - - self.embedding_proj = nn.Linear(embedding_dim, inner_dim) - self.encoder_hidden_states_proj = nn.Linear(embedding_dim, inner_dim) - - self.positional_embedding = self.create_parameter( - (1, num_embeddings + additional_embeddings, inner_dim), - dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(0.0), - ) - self.prd_embedding = self.create_parameter( - (1, 1, inner_dim), dtype=paddle.get_default_dtype(), default_initializer=nn.initializer.Constant(0.0) - ) - self.transformer_blocks = nn.LayerList( - [ - BasicTransformerBlock( - inner_dim, - num_attention_heads, - attention_head_dim, - dropout=dropout, - activation_fn="gelu", - attention_bias=True, - ) - for d in range(num_layers) - ] - ) - - self.norm_out = nn.LayerNorm(inner_dim) - self.proj_to_clip_embeddings = nn.Linear(inner_dim, embedding_dim) - - causal_attention_mask = paddle.triu( - paddle.full([num_embeddings + additional_embeddings, num_embeddings + additional_embeddings], NEG_INF), 1 - ) - causal_attention_mask = causal_attention_mask.unsqueeze(0) - self.register_buffer("causal_attention_mask", causal_attention_mask, persistable=False) - - self.clip_mean = self.create_parameter( - (1, embedding_dim), dtype=paddle.get_default_dtype(), default_initializer=nn.initializer.Constant(0.0) - ) - self.clip_std = self.create_parameter( - (1, embedding_dim), dtype=paddle.get_default_dtype(), default_initializer=nn.initializer.Constant(0.0) - ) - - def forward( - self, - hidden_states, - timestep: Union[paddle.Tensor, float, int], - proj_embedding: paddle.Tensor, - encoder_hidden_states: paddle.Tensor, - attention_mask: Optional[paddle.Tensor] = None, - return_dict: bool = True, - ): - """ - Args: - hidden_states (`paddle.Tensor` of shape `(batch_size, embedding_dim)`): - x_t, the currently predicted image embeddings. 
- timestep (`paddle.Tensor`): - Current denoising step. - proj_embedding (`paddle.Tensor` of shape `(batch_size, embedding_dim)`): - Projected embedding vector the denoising process is conditioned on. - encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, num_embeddings, embedding_dim)`): - Hidden states of the text embeddings the denoising process is conditioned on. - attention_mask (`paddle.Tensor` of shape `(batch_size, num_embeddings)`): - Text mask for the text embeddings. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.prior_transformer.PriorTransformerOutput`] instead of a plain - tuple. - - Returns: - [`~models.prior_transformer.PriorTransformerOutput`] or `tuple`: - [`~models.prior_transformer.PriorTransformerOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. - """ - hidden_states = hidden_states.cast(self.dtype) - batch_size = hidden_states.shape[0] - - timesteps = timestep - if not paddle.is_tensor(timesteps): - timesteps = paddle.to_tensor([timesteps], dtype=paddle.int64) - elif paddle.is_tensor(timesteps) and len(timesteps.shape) == 0: - timesteps = timesteps[None] - - # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps * paddle.ones((batch_size,), dtype=timesteps.dtype) - - timesteps_projected = self.time_proj(timesteps) - - # timesteps does not contain any weights and will always return f32 tensors - # but time_embedding might be fp16, so we need to cast here. - timesteps_projected = timesteps_projected.cast(hidden_states.dtype) - time_embeddings = self.time_embedding(timesteps_projected) - - proj_embeddings = self.embedding_proj(proj_embedding) - encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states) - hidden_states = self.proj_in(hidden_states) - prd_embedding = self.prd_embedding.cast(hidden_states.dtype).expand([batch_size, -1, -1]) - positional_embeddings = self.positional_embedding.cast(hidden_states.dtype) - - hidden_states = paddle.concat( - [ - encoder_hidden_states, - proj_embeddings[:, None, :], - time_embeddings[:, None, :], - hidden_states[:, None, :], - prd_embedding, - ], - axis=1, - ) - - hidden_states = hidden_states + positional_embeddings - - if attention_mask is not None: - attention_mask = (1 - attention_mask.cast(hidden_states.dtype)) * NEG_INF - attention_mask = F.pad( - attention_mask.unsqueeze(0), (0, self.additional_embeddings), value=0.0, data_format="NCL" - ).squeeze(0) - attention_mask = (attention_mask[:, None, :] + self.causal_attention_mask).cast(hidden_states.dtype) - attention_mask = attention_mask.repeat_interleave(self.config.num_attention_heads, axis=0) - - for block in self.transformer_blocks: - hidden_states = block(hidden_states, attention_mask=attention_mask) - - hidden_states = self.norm_out(hidden_states) - hidden_states = hidden_states[:, -1] - predicted_image_embedding = self.proj_to_clip_embeddings(hidden_states) - - if not return_dict: - return (predicted_image_embedding,) - - return PriorTransformerOutput(predicted_image_embedding=predicted_image_embedding) - - def post_process_latents(self, prior_latents): - prior_latents = (prior_latents * self.clip_std) + self.clip_mean - return prior_latents diff --git a/ppdiffusers/ppdiffusers/models/resnet.py b/ppdiffusers/ppdiffusers/models/resnet.py deleted file mode 100644 index 26ed745f93dd..000000000000 --- a/ppdiffusers/ppdiffusers/models/resnet.py +++ /dev/null @@ -1,866 +0,0 @@ -# 
Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# `TemporalConvLayer` Copyright 2023 Alibaba DAMO-VILAB, The ModelScope Team and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from functools import partial -from typing import Optional - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ..initializer import zeros_ -from .attention import AdaGroupNorm - - -class Upsample1D(nn.Layer): - """ - An upsampling layer with an optional convolution. - - Parameters: - channels: channels in the inputs and outputs. - use_conv: a bool determining if a convolution is applied. - use_conv_transpose: - out_channels: - """ - - def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"): - super().__init__() - self.channels = channels - self.out_channels = out_channels or channels - self.use_conv = use_conv - self.use_conv_transpose = use_conv_transpose - self.name = name - - self.conv = None - if use_conv_transpose: - self.conv = nn.Conv1DTranspose(channels, self.out_channels, 4, 2, 1) - elif use_conv: - self.conv = nn.Conv1D(self.channels, self.out_channels, 3, padding=1) - - def forward(self, x): - assert x.shape[1] == self.channels - if self.use_conv_transpose: - return self.conv(x) - - x = F.interpolate(x, scale_factor=2.0, mode="nearest") - - if self.use_conv: - x = self.conv(x) - - return x - - -class Downsample1D(nn.Layer): - """ - A downsampling layer with an optional convolution. - - Parameters: - channels: channels in the inputs and outputs. - use_conv: a bool determining if a convolution is applied. - out_channels: - padding: - """ - - def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"): - super().__init__() - self.channels = channels - self.out_channels = out_channels or channels - self.use_conv = use_conv - self.padding = padding - stride = 2 - self.name = name - - if use_conv: - self.conv = nn.Conv1D(self.channels, self.out_channels, 3, stride=stride, padding=padding) - else: - assert self.channels == self.out_channels - self.conv = nn.AvgPool1D(kernel_size=stride, stride=stride) - - def forward(self, x): - assert x.shape[1] == self.channels - return self.conv(x) - - -class Upsample2D(nn.Layer): - """ - An upsampling layer with an optional convolution. - - Parameters: - channels: channels in the inputs and outputs. - use_conv: a bool determining if a convolution is applied. 
- use_conv_transpose: - out_channels: - """ - - def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"): - super().__init__() - self.channels = channels - self.out_channels = out_channels or channels - self.use_conv = use_conv - self.use_conv_transpose = use_conv_transpose - self.name = name - - conv = None - if use_conv_transpose: - conv = nn.Conv2DTranspose(channels, self.out_channels, 4, 2, 1) - elif use_conv: - conv = nn.Conv2D(self.channels, self.out_channels, 3, padding=1) - - # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed - if name == "conv": - self.conv = conv - else: - self.Conv2d_0 = conv - - def forward(self, hidden_states, output_size=None): - assert hidden_states.shape[1] == self.channels - - if self.use_conv_transpose: - return self.conv(hidden_states) - - # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16 - # TODO(Suraj): Remove this cast once the issue is fixed in PyTorch - # https://github.com/pytorch/pytorch/issues/86679 - dtype = hidden_states.dtype - if dtype == paddle.bfloat16: - hidden_states = hidden_states.cast("float32") - - # if `output_size` is passed we force the interpolation output - # size and do not make use of `scale_factor=2` - if output_size is None: - hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest") - else: - hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest") - - # If the input is bfloat16, we cast back to bfloat16 - if dtype == paddle.bfloat16: - hidden_states = hidden_states.cast(dtype) - - # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed - if self.use_conv: - if self.name == "conv": - hidden_states = self.conv(hidden_states) - else: - hidden_states = self.Conv2d_0(hidden_states) - - return hidden_states - - -class Downsample2D(nn.Layer): - """ - A downsampling layer with an optional convolution. - - Parameters: - channels: channels in the inputs and outputs. - use_conv: a bool determining if a convolution is applied. 
- out_channels: - padding: - """ - - def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"): - super().__init__() - self.channels = channels - self.out_channels = out_channels or channels - self.use_conv = use_conv - self.padding = padding - stride = 2 - self.name = name - - if use_conv: - conv = nn.Conv2D(self.channels, self.out_channels, 3, stride=stride, padding=padding) - else: - assert self.channels == self.out_channels - conv = nn.AvgPool2D(kernel_size=stride, stride=stride) - - # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed - if name == "conv": - self.Conv2d_0 = conv - self.conv = conv - elif name == "Conv2d_0": - self.conv = conv - else: - self.conv = conv - - def forward(self, hidden_states): - assert hidden_states.shape[1] == self.channels - if self.use_conv and self.padding == 0: - pad = (0, 1, 0, 1) - hidden_states = F.pad(hidden_states, pad, mode="constant", value=0) - - assert hidden_states.shape[1] == self.channels - hidden_states = self.conv(hidden_states) - - return hidden_states - - -class FirUpsample2D(nn.Layer): - def __init__(self, channels=None, out_channels=None, use_conv=False, fir_kernel=(1, 3, 3, 1)): - super().__init__() - out_channels = out_channels if out_channels else channels - if use_conv: - self.Conv2d_0 = nn.Conv2D(channels, out_channels, kernel_size=3, stride=1, padding=1) - self.use_conv = use_conv - self.fir_kernel = fir_kernel - self.out_channels = out_channels - - def _upsample_2d(self, hidden_states, weight=None, kernel=None, factor=2, gain=1): - """Fused `upsample_2d()` followed by `Conv2d()`. - - Padding is performed only once at the beginning, not between the operations. The fused op is considerably more - efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of - arbitrary order. - - Args: - hidden_states: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. - weight: Weight tensor of the shape `[filterH, filterW, inChannels, - outChannels]`. Grouped convolution can be performed by `inChannels = x.shape[0] // numGroups`. - kernel: FIR filter of the shape `[firH, firW]` or `[firN]` - (separable). The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling. - factor: Integer upsampling factor (default: 2). - gain: Scaling factor for signal magnitude (default: 1.0). - - Returns: - output: Tensor of the shape `[N, C, H * factor, W * factor]` or `[N, H * factor, W * factor, C]`, and same - datatype as `hidden_states`. - """ - - assert isinstance(factor, int) and factor >= 1 - - # Setup filter kernel. - if kernel is None: - kernel = [1] * factor - - # setup kernel - kernel = paddle.to_tensor(kernel, dtype="float32") - if kernel.ndim == 1: - kernel = paddle.outer(kernel, kernel) - kernel /= paddle.sum(kernel) - - kernel = kernel * (gain * (factor**2)) - - if self.use_conv: - convH = weight.shape[2] - convW = weight.shape[3] - inC = weight.shape[1] - - pad_value = (kernel.shape[0] - factor) - (convW - 1) - - stride = (factor, factor) - # Determine data dimensions. - output_shape = ( - (hidden_states.shape[2] - 1) * factor + convH, - (hidden_states.shape[3] - 1) * factor + convW, - ) - output_padding = ( - output_shape[0] - (hidden_states.shape[2] - 1) * stride[0] - convH, - output_shape[1] - (hidden_states.shape[3] - 1) * stride[1] - convW, - ) - assert output_padding[0] >= 0 and output_padding[1] >= 0 - num_groups = hidden_states.shape[1] // inC - - # Transpose weights. 
- weight = weight.reshape([num_groups, -1, inC, convH, convW]) - weight = paddle.flip(weight, axis=[3, 4]).transpose([0, 2, 1, 3, 4]) - weight = weight.reshape([num_groups * inC, -1, convH, convW]) - - inverse_conv = F.conv2d_transpose( - hidden_states, weight, stride=stride, output_padding=output_padding, padding=0 - ) - - output = upfirdn2d_native( - inverse_conv, - paddle.to_tensor(kernel), - pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2 + 1), - ) - else: - pad_value = kernel.shape[0] - factor - output = upfirdn2d_native( - hidden_states, - paddle.to_tensor(kernel), - up=factor, - pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2), - ) - - return output - - def forward(self, hidden_states): - if self.use_conv: - height = self._upsample_2d(hidden_states, self.Conv2d_0.weight, kernel=self.fir_kernel) - height = height + self.Conv2d_0.bias.reshape([1, -1, 1, 1]) - else: - height = self._upsample_2d(hidden_states, kernel=self.fir_kernel, factor=2) - - return height - - -class FirDownsample2D(nn.Layer): - def __init__(self, channels=None, out_channels=None, use_conv=False, fir_kernel=(1, 3, 3, 1)): - super().__init__() - out_channels = out_channels if out_channels else channels - if use_conv: - self.Conv2d_0 = nn.Conv2D(channels, out_channels, kernel_size=3, stride=1, padding=1) - self.fir_kernel = fir_kernel - self.use_conv = use_conv - self.out_channels = out_channels - - def _downsample_2d(self, hidden_states, weight=None, kernel=None, factor=2, gain=1): - """Fused `Conv2d()` followed by `downsample_2d()`. - Padding is performed only once at the beginning, not between the operations. The fused op is considerably more - efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of - arbitrary order. - - Args: - hidden_states: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. - weight: - Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`. Grouped convolution can be - performed by `inChannels = x.shape[0] // numGroups`. - kernel: FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * - factor`, which corresponds to average pooling. - factor: Integer downsampling factor (default: 2). - gain: Scaling factor for signal magnitude (default: 1.0). - - Returns: - output: Tensor of the shape `[N, C, H // factor, W // factor]` or `[N, H // factor, W // factor, C]`, and - same datatype as `x`. 
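Example (an illustrative sketch of the public `FirDownsample2D` wrapper around this helper; the shapes and the import path are assumptions):

```py
>>> import paddle
>>> from ppdiffusers.models.resnet import FirDownsample2D

>>> down = FirDownsample2D(channels=64, out_channels=64, use_conv=True)
>>> x = paddle.randn([1, 64, 32, 32])
>>> down(x).shape  # FIR filtering fused with a stride-2 conv: [1, 64, 16, 16]
```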
- """ - - assert isinstance(factor, int) and factor >= 1 - if kernel is None: - kernel = [1] * factor - - # setup kernel - kernel = paddle.to_tensor(kernel, dtype="float32") - if kernel.ndim == 1: - kernel = paddle.outer(kernel, kernel) - kernel /= paddle.sum(kernel) - - kernel = kernel * gain - - if self.use_conv: - _, _, convH, convW = weight.shape - pad_value = (kernel.shape[0] - factor) + (convW - 1) - stride_value = [factor, factor] - upfirdn_input = upfirdn2d_native( - hidden_states, - paddle.to_tensor(kernel), - pad=((pad_value + 1) // 2, pad_value // 2), - ) - output = F.conv2d(upfirdn_input, weight, stride=stride_value, padding=0) - else: - pad_value = kernel.shape[0] - factor - output = upfirdn2d_native( - hidden_states, - paddle.to_tensor(kernel), - down=factor, - pad=((pad_value + 1) // 2, pad_value // 2), - ) - - return output - - def forward(self, hidden_states): - if self.use_conv: - downsample_input = self._downsample_2d(hidden_states, weight=self.Conv2d_0.weight, kernel=self.fir_kernel) - hidden_states = downsample_input + self.Conv2d_0.bias.reshape([1, -1, 1, 1]) - else: - hidden_states = self._downsample_2d(hidden_states, kernel=self.fir_kernel, factor=2) - - return hidden_states - - -# downsample/upsample layer used in k-upscaler, might be able to use FirDownsample2D/DirUpsample2D instead -class KDownsample2D(nn.Layer): - def __init__(self, pad_mode="reflect"): - super().__init__() - self.pad_mode = pad_mode - kernel_1d = paddle.to_tensor([[1 / 8, 3 / 8, 3 / 8, 1 / 8]]) - self.pad = kernel_1d.shape[1] // 2 - 1 - self.register_buffer("kernel", paddle.matmul(kernel_1d, kernel_1d, transpose_x=True), persistable=False) - - def forward(self, x): - x = F.pad(x, (self.pad,) * 4, self.pad_mode) - weight = paddle.zeros([x.shape[1], x.shape[1], self.kernel.shape[0], self.kernel.shape[1]], dtype=x.dtype) - indices = paddle.arange(x.shape[1]) - # TODO verify this method - weight[indices, indices] = self.kernel.cast(weight.dtype) - return F.conv2d(x, weight, stride=2) - - -class KUpsample2D(nn.Layer): - def __init__(self, pad_mode="reflect"): - super().__init__() - self.pad_mode = pad_mode - kernel_1d = paddle.to_tensor([[1 / 8, 3 / 8, 3 / 8, 1 / 8]]) * 2 - self.pad = kernel_1d.shape[1] // 2 - 1 - self.register_buffer("kernel", paddle.matmul(kernel_1d, kernel_1d, transpose_x=True), persistable=False) - - def forward(self, x): - x = F.pad(x, ((self.pad + 1) // 2,) * 4, self.pad_mode) - weight = paddle.zeros([x.shape[1], x.shape[1], self.kernel.shape[0], self.kernel.shape[1]], dtype=x.dtype) - indices = paddle.arange(x.shape[1]) - # TODO verify this method - weight[indices, indices] = self.kernel.cast(weight.dtype) - return F.conv2d_transpose(x, weight, stride=2, padding=self.pad * 2 + 1) - - -class ResnetBlock2D(nn.Layer): - r""" - A Resnet block. - - Parameters: - in_channels (`int`): The number of channels in the input. - out_channels (`int`, *optional*, default to be `None`): - The number of output channels for the first conv2d layer. If None, same as `in_channels`. - dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use. - temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding. - groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer. - groups_out (`int`, *optional*, default to None): - The number of groups to use for the second normalization layer. if set to None, same as `groups`. 
- eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization. - non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use. - time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config. - By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" or - "ada_group" for a stronger conditioning with scale and shift. - kernel (`paddle.Tensor`, optional, default to None): FIR filter, see - [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`]. - output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output. - use_in_shortcut (`bool`, *optional*, default to `True`): - If `True`, add a 1x1 nn.conv2d layer for skip-connection. - up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer. - down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer. - conv_shortcut_bias (`bool`, *optional*, default to `True`): If `True`, adds a learnable bias to the - `conv_shortcut` output. - conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output. - If None, same as `out_channels`. - """ - - def __init__( - self, - *, - in_channels, - out_channels=None, - conv_shortcut=False, - dropout=0.0, - temb_channels=512, - groups=32, - groups_out=None, - pre_norm=True, - eps=1e-6, - non_linearity="swish", - skip_time_act: bool = False, # skip_time_act is the same as pre_temb_non_linearity - time_embedding_norm="default", # default, scale_shift, ada_group - kernel=None, - output_scale_factor=1.0, - use_in_shortcut=None, - up=False, - down=False, - conv_shortcut_bias: bool = True, - conv_2d_out_channels: Optional[int] = None, - pre_temb_non_linearity: bool = False, # skip_time_act is the same as pre_temb_non_linearity - ): - super().__init__() - self.pre_temb_non_linearity = pre_temb_non_linearity - self.pre_norm = pre_norm - self.pre_norm = True - self.in_channels = in_channels - out_channels = in_channels if out_channels is None else out_channels - self.out_channels = out_channels - self.use_conv_shortcut = conv_shortcut - self.up = up - self.down = down - self.output_scale_factor = output_scale_factor - self.time_embedding_norm = time_embedding_norm - self.skip_time_act = skip_time_act - - if groups_out is None: - groups_out = groups - - if self.time_embedding_norm == "ada_group": - self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps) - else: - self.norm1 = nn.GroupNorm(num_groups=groups, num_channels=in_channels, epsilon=eps) - - self.conv1 = nn.Conv2D(in_channels, out_channels, kernel_size=3, stride=1, padding=1) - - if temb_channels is not None: - if self.time_embedding_norm == "default": - self.time_emb_proj = nn.Linear(temb_channels, out_channels) - elif self.time_embedding_norm == "scale_shift": - self.time_emb_proj = nn.Linear(temb_channels, 2 * out_channels) - elif self.time_embedding_norm == "ada_group": - self.time_emb_proj = None - else: - raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ") - else: - self.time_emb_proj = None - - if self.time_embedding_norm == "ada_group": - self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps) - else: - self.norm2 = nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, epsilon=eps) - - self.dropout = nn.Dropout(dropout) - conv_2d_out_channels = conv_2d_out_channels or out_channels - self.conv2 = nn.Conv2D(out_channels, 
conv_2d_out_channels, kernel_size=3, stride=1, padding=1) - - if non_linearity == "swish": - self.nonlinearity = lambda x: F.silu(x) - elif non_linearity == "mish": - self.nonlinearity = nn.Mish() - elif non_linearity == "silu": - self.nonlinearity = nn.Silu() - elif non_linearity == "gelu": - self.nonlinearity = nn.GELU() - - self.upsample = self.downsample = None - if self.up: - if kernel == "fir": - fir_kernel = (1, 3, 3, 1) - self.upsample = lambda x: upsample_2d(x, kernel=fir_kernel) - elif kernel == "sde_vp": - self.upsample = partial(F.interpolate, scale_factor=2.0, mode="nearest") - else: - self.upsample = Upsample2D(in_channels, use_conv=False) - elif self.down: - if kernel == "fir": - fir_kernel = (1, 3, 3, 1) - self.downsample = lambda x: downsample_2d(x, kernel=fir_kernel) - elif kernel == "sde_vp": - self.downsample = partial(F.avg_pool2d, kernel_size=2, stride=2) - else: - self.downsample = Downsample2D(in_channels, use_conv=False, padding=1, name="op") - - self.use_in_shortcut = self.in_channels != conv_2d_out_channels if use_in_shortcut is None else use_in_shortcut - - self.conv_shortcut = None - if self.use_in_shortcut: - self.conv_shortcut = nn.Conv2D( - in_channels, conv_2d_out_channels, kernel_size=1, stride=1, padding=0, bias_attr=conv_shortcut_bias - ) - - def forward(self, input_tensor, temb): - hidden_states = input_tensor - - if self.time_embedding_norm == "ada_group": - hidden_states = self.norm1(hidden_states, temb) - else: - hidden_states = self.norm1(hidden_states) - - hidden_states = self.nonlinearity(hidden_states) - - if self.upsample is not None: - input_tensor = self.upsample(input_tensor) - hidden_states = self.upsample(hidden_states) - elif self.downsample is not None: - input_tensor = self.downsample(input_tensor) - hidden_states = self.downsample(hidden_states) - - hidden_states = self.conv1(hidden_states) - - if self.time_emb_proj is not None: - if not self.pre_temb_non_linearity and not self.skip_time_act: - temb = self.nonlinearity(temb) - temb = self.time_emb_proj(temb)[:, :, None, None] - - if temb is not None and self.time_embedding_norm == "default": - hidden_states = hidden_states + temb - - if self.time_embedding_norm == "ada_group": - hidden_states = self.norm2(hidden_states, temb) - else: - hidden_states = self.norm2(hidden_states) - - if temb is not None and self.time_embedding_norm == "scale_shift": - scale, shift = temb.chunk(2, axis=1) - hidden_states = hidden_states * (1 + scale) + shift - - hidden_states = self.nonlinearity(hidden_states) - - hidden_states = self.dropout(hidden_states) - hidden_states = self.conv2(hidden_states) - - if self.conv_shortcut is not None: - input_tensor = self.conv_shortcut(input_tensor) - - # TODO this maybe result -inf, input_tensor's min value -57644 hidden_states's min value -10000 - output_tensor = (input_tensor + hidden_states) / self.output_scale_factor - - return output_tensor - - -class Mish(nn.Layer): - def forward(self, hidden_states): - return hidden_states * paddle.tanh(F.softplus(hidden_states)) - - -# unet_rl.py -def rearrange_dims(tensor): - if len(tensor.shape) == 2: - return tensor[:, :, None] - if len(tensor.shape) == 3: - return tensor[:, :, None, :] - elif len(tensor.shape) == 4: - return tensor[:, :, 0, :] - else: - raise ValueError(f"`len(tensor)`: {len(tensor)} has to be 2, 3 or 4.") - - -class Conv1dBlock(nn.Layer): - """ - Conv1d --> GroupNorm --> Mish - """ - - def __init__(self, inp_channels, out_channels, kernel_size, n_groups=8): - super().__init__() - - self.conv1d = 
nn.Conv1D(inp_channels, out_channels, kernel_size, padding=kernel_size // 2) - self.group_norm = nn.GroupNorm(n_groups, out_channels) - self.mish = nn.Mish() - - def forward(self, x): - x = self.conv1d(x) - x = rearrange_dims(x) - x = self.group_norm(x) - x = rearrange_dims(x) - x = self.mish(x) - return x - - -# unet_rl.py -class ResidualTemporalBlock1D(nn.Layer): - def __init__(self, inp_channels, out_channels, embed_dim, kernel_size=5): - super().__init__() - self.conv_in = Conv1dBlock(inp_channels, out_channels, kernel_size) - self.conv_out = Conv1dBlock(out_channels, out_channels, kernel_size) - - self.time_emb_act = nn.Mish() - self.time_emb = nn.Linear(embed_dim, out_channels) - - self.residual_conv = ( - nn.Conv1D(inp_channels, out_channels, 1) if inp_channels != out_channels else nn.Identity() - ) - - def forward(self, x, t): - """ - Args: - x : [ batch_size x inp_channels x horizon ] - t : [ batch_size x embed_dim ] - - returns: - out : [ batch_size x out_channels x horizon ] - """ - t = self.time_emb_act(t) - t = self.time_emb(t) - out = self.conv_in(x) + rearrange_dims(t) - out = self.conv_out(out) - return out + self.residual_conv(x) - - -def upsample_2d(hidden_states, kernel=None, factor=2, gain=1): - r"""Upsample2D a batch of 2D images with the given filter. - Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and upsamples each image with the given - filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the specified - `gain`. Pixels outside the image are assumed to be zero, and the filter is padded with zeros so that its shape is - a: multiple of the upsampling factor. - - Args: - hidden_states: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. - kernel: FIR filter of the shape `[firH, firW]` or `[firN]` - (separable). The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling. - factor: Integer upsampling factor (default: 2). - gain: Scaling factor for signal magnitude (default: 1.0). - - Returns: - output: Tensor of the shape `[N, C, H * factor, W * factor]` - """ - assert isinstance(factor, int) and factor >= 1 - if kernel is None: - kernel = [1] * factor - - kernel = paddle.to_tensor(kernel, dtype=paddle.float32) - if kernel.ndim == 1: - kernel = paddle.outer(kernel, kernel) - kernel /= paddle.sum(kernel) - - kernel = kernel * (gain * (factor**2)) - pad_value = kernel.shape[0] - factor - output = upfirdn2d_native( - hidden_states, - kernel, - up=factor, - pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2), - ) - return output - - -def downsample_2d(hidden_states, kernel=None, factor=2, gain=1): - r"""Downsample2D a batch of 2D images with the given filter. - Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and downsamples each image with the - given filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the - specified `gain`. Pixels outside the image are assumed to be zero, and the filter is padded with zeros so that its - shape is a multiple of the downsampling factor. - - Args: - hidden_states: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. - kernel: FIR filter of the shape `[firH, firW]` or `[firN]` - (separable). The default is `[1] * factor`, which corresponds to average pooling. - factor: Integer downsampling factor (default: 2). - gain: Scaling factor for signal magnitude (default: 1.0). 
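# Illustrative sketch only (not part of the removed file): with the default kernel
# (`[1] * factor`), upsample_2d reduces to nearest-neighbor upsampling and downsample_2d to
# 2x2 average pooling, as the docstrings state. Imports refer to the module deleted by this patch.
import paddle
import paddle.nn.functional as F
from ppdiffusers.models.resnet import downsample_2d, upsample_2d

x = paddle.randn([1, 3, 8, 8])

up = upsample_2d(x, factor=2)                                      # [1, 3, 16, 16]
print(paddle.allclose(up, F.interpolate(x, scale_factor=2, mode="nearest")).item())

down = downsample_2d(x, factor=2)                                  # [1, 3, 4, 4]
print(paddle.allclose(down, F.avg_pool2d(x, kernel_size=2, stride=2)).item())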
- - Returns: - output: Tensor of the shape `[N, C, H // factor, W // factor]` - """ - - assert isinstance(factor, int) and factor >= 1 - if kernel is None: - kernel = [1] * factor - - kernel = paddle.to_tensor(kernel, dtype=paddle.float32) - if kernel.ndim == 1: - kernel = paddle.outer(kernel, kernel) - kernel /= paddle.sum(kernel) - - kernel = kernel * gain - pad_value = kernel.shape[0] - factor - output = upfirdn2d_native(hidden_states, kernel, down=factor, pad=((pad_value + 1) // 2, pad_value // 2)) - return output - - -def dummy_pad(tensor, up_x=0, up_y=0): - if up_x > 0: - tensor = paddle.concat( - [ - tensor, - paddle.zeros( - [tensor.shape[0], tensor.shape[1], tensor.shape[2], tensor.shape[3], up_x, tensor.shape[5]], - dtype=tensor.dtype, - ), - ], - axis=4, - ) - if up_y > 0: - tensor = paddle.concat( - [ - tensor, - paddle.zeros( - [tensor.shape[0], tensor.shape[1], up_y, tensor.shape[3], tensor.shape[4], tensor.shape[5]], - dtype=tensor.dtype, - ), - ], - axis=2, - ) - return tensor - - -def upfirdn2d_native(tensor, kernel, up=1, down=1, pad=(0, 0)): - up_x = up_y = up - down_x = down_y = down - pad_x0 = pad_y0 = pad[0] - pad_x1 = pad_y1 = pad[1] - - _, channel, in_h, in_w = tensor.shape - tensor = tensor.reshape([-1, in_h, in_w, 1]) - - _, in_h, in_w, minor = tensor.shape - kernel_h, kernel_w = kernel.shape - - out = tensor.reshape([-1, in_h, 1, in_w, 1, minor]) - # (TODO, junnyu F.pad bug) - # F.pad(out, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1]) - out = dummy_pad(out, up_x - 1, up_y - 1) - out = out.reshape([-1, in_h * up_y, in_w * up_x, minor]) - - # (TODO, junnyu F.pad bug) - # out = F.pad(out, [0, 0, max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0)]) - out = out.unsqueeze(0) - out = F.pad(out, [max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0), 0, 0], data_format="NDHWC") - out = out.squeeze(0) - - out = out[ - :, - max(-pad_y0, 0) : out.shape[1] - max(-pad_y1, 0), - max(-pad_x0, 0) : out.shape[2] - max(-pad_x1, 0), - :, - ] - - out = out.transpose([0, 3, 1, 2]) - out = out.reshape([-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1]) - w = paddle.flip(kernel, [0, 1]).reshape([1, 1, kernel_h, kernel_w]) - out = F.conv2d(out, w) - out = out.reshape( - [-1, minor, in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1, in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1] - ) - out = out.transpose([0, 2, 3, 1]) - out = out[:, ::down_y, ::down_x, :] - - out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1 - out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1 - - return out.reshape([-1, channel, out_h, out_w]) - - -class TemporalConvLayer(nn.Layer): - """ - Temporal convolutional layer that can be used for video (sequence of images) input Code mostly copied from: - https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/models/multi_modal/video_synthesis/unet_sd.py#L1016 - """ - - def __init__(self, in_dim, out_dim=None, dropout=0.0): - super().__init__() - out_dim = out_dim or in_dim - self.in_dim = in_dim - self.out_dim = out_dim - self.conv1 = nn.Sequential( - nn.GroupNorm(num_groups=32, num_channels=in_dim), - nn.Silu(), - nn.Conv3D(in_channels=in_dim, out_channels=out_dim, kernel_size=(3, 1, 1), padding=(1, 0, 0)), - ) - self.conv2 = nn.Sequential( - nn.GroupNorm(num_groups=32, num_channels=out_dim), - nn.Silu(), - nn.Dropout(p=dropout), - nn.Conv3D(in_channels=out_dim, out_channels=in_dim, kernel_size=(3, 1, 1), padding=(1, 0, 0)), - ) - self.conv3 = nn.Sequential( - 
nn.GroupNorm(num_groups=32, num_channels=out_dim), - nn.Silu(), - nn.Dropout(p=dropout), - nn.Conv3D(in_channels=out_dim, out_channels=in_dim, kernel_size=(3, 1, 1), padding=(1, 0, 0)), - ) - self.conv4 = nn.Sequential( - nn.GroupNorm(num_groups=32, num_channels=out_dim), - nn.Silu(), - nn.Dropout(p=dropout), - nn.Conv3D(in_channels=out_dim, out_channels=in_dim, kernel_size=(3, 1, 1), padding=(1, 0, 0)), - ) - zeros_(self.conv4[-1].weight) - zeros_(self.conv4[-1].bias) - - def forward(self, hidden_states, num_frames=1): - hidden_states = ( - hidden_states[None, :] - .reshape((-1, num_frames) + tuple(hidden_states.shape[1:])) - .transpose(perm=[0, 2, 1, 3, 4]) - ) - identity = hidden_states - hidden_states = self.conv1(hidden_states) - hidden_states = self.conv2(hidden_states) - hidden_states = self.conv3(hidden_states) - hidden_states = self.conv4(hidden_states) - hidden_states = identity + hidden_states - hidden_states = hidden_states.transpose(perm=[0, 2, 1, 3, 4]).reshape( - (hidden_states.shape[0] * hidden_states.shape[2], -1) + tuple(hidden_states.shape[3:]) - ) - return hidden_states diff --git a/ppdiffusers/ppdiffusers/models/t5_film_transformer.py b/ppdiffusers/ppdiffusers/models/t5_film_transformer.py deleted file mode 100644 index 33986470c8c7..000000000000 --- a/ppdiffusers/ppdiffusers/models/t5_film_transformer.py +++ /dev/null @@ -1,323 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
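# Illustrative sketch only (not part of the removed files): TemporalConvLayer above expects video
# frames flattened into the batch dimension ([batch * num_frames, C, H, W]) and restores that layout
# on output; because conv4 is zero-initialised, the block starts out as an identity mapping.
# The import path and sizes are assumptions for the demo.
import paddle
from ppdiffusers.models.resnet import TemporalConvLayer

layer = TemporalConvLayer(in_dim=32)
video = paddle.randn([2 * 8, 32, 16, 16])    # 2 clips of 8 frames each
out = layer(video, num_frames=8)
print(out.shape)                             # [16, 32, 16, 16], same layout as the input
print(paddle.allclose(out, video).item())    # True before training: the residual branch outputs zeros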
-import math - -import paddle -import paddle.nn as nn - -from ..configuration_utils import ConfigMixin, register_to_config -from .attention_processor import Attention -from .embeddings import get_timestep_embedding -from .modeling_utils import ModelMixin - - -class T5FilmDecoder(ModelMixin, ConfigMixin): - @register_to_config - def __init__( - self, - input_dims: int = 128, - targets_length: int = 256, - max_decoder_noise_time: float = 2000.0, - d_model: int = 768, - num_layers: int = 12, - num_heads: int = 12, - d_kv: int = 64, - d_ff: int = 2048, - dropout_rate: float = 0.1, - ): - super().__init__() - - self.conditioning_emb = nn.Sequential( - nn.Linear(d_model, d_model * 4, bias_attr=False), - nn.Silu(), - nn.Linear(d_model * 4, d_model * 4, bias_attr=False), - nn.Silu(), - ) - - self.position_encoding = nn.Embedding(targets_length, d_model) - self.position_encoding.weight.stop_gradient = True - - self.continuous_inputs_projection = nn.Linear(input_dims, d_model, bias_attr=False) - - self.dropout = nn.Dropout(p=dropout_rate) - - self.decoders = nn.LayerList() - for lyr_num in range(num_layers): - # FiLM conditional T5 decoder - lyr = DecoderLayer(d_model=d_model, d_kv=d_kv, num_heads=num_heads, d_ff=d_ff, dropout_rate=dropout_rate) - self.decoders.append(lyr) - - self.decoder_norm = T5LayerNorm(d_model) - - self.post_dropout = nn.Dropout(p=dropout_rate) - self.spec_out = nn.Linear(d_model, input_dims, bias_attr=False) - - def encoder_decoder_mask(self, query_input, key_input): - mask = paddle.multiply(query_input.unsqueeze(-1), key_input.unsqueeze(-2).cast(query_input.dtype)) - return mask.unsqueeze(-3) - - def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time): - batch, _, _ = decoder_input_tokens.shape - assert decoder_noise_time.shape[0] == batch - - # decoder_noise_time is in [0, 1), so rescale to expected timing range. - time_steps = get_timestep_embedding( - decoder_noise_time * self.config.max_decoder_noise_time, - embedding_dim=self.config.d_model, - max_period=self.config.max_decoder_noise_time, - ).cast(self.dtype) - - conditioning_emb = self.conditioning_emb(time_steps).unsqueeze(1) - - assert conditioning_emb.shape == [batch, 1, self.config.d_model * 4] - - seq_length = decoder_input_tokens.shape[1] - - # If we want to use relative positions for audio context, we can just offset - # this sequence by the length of encodings_and_masks. - decoder_positions = paddle.broadcast_to( - paddle.arange( - seq_length, - ), - shape=(batch, seq_length), - ) - - position_encodings = self.position_encoding(decoder_positions) - inputs = self.continuous_inputs_projection(decoder_input_tokens.cast(position_encodings.dtype)) - inputs += position_encodings - y = self.dropout(inputs) - - # decoder: No padding present. - decoder_mask = paddle.ones(decoder_input_tokens.shape[:2], dtype=inputs.dtype) - - # Translate encoding masks to encoder-decoder masks. 
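# Shape note for the step below (toy sizes, assumed for illustration; not part of the removed file):
# each entry of `encodings_and_masks` is a pair (encoder hidden states, key padding mask of shape
# [batch, source_len]). Multiplying the all-ones decoder mask [batch, target_len] with a key mask and
# inserting a broadcastable head axis yields an attention mask of shape [batch, 1, target_len, source_len],
# e.g. decoder_mask [2, 256] and encoder mask [2, 77] -> encoder_decoder_mask [2, 1, 256, 77].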
- encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks] - - # cross attend style: concat encodings - encoded = paddle.concat([x[0] for x in encodings_and_encdec_masks], axis=1) - encoder_decoder_mask = paddle.concat([x[1] for x in encodings_and_encdec_masks], axis=-1) - - for lyr in self.decoders: - y = lyr( - y, - conditioning_emb=conditioning_emb, - encoder_hidden_states=encoded, - encoder_attention_mask=encoder_decoder_mask, - )[0] - - y = self.decoder_norm(y) - y = self.post_dropout(y) - - spec_out = self.spec_out(y) - return spec_out - - -class DecoderLayer(nn.Layer): - def __init__(self, d_model, d_kv, num_heads, d_ff, dropout_rate, layer_norm_epsilon=1e-6): - super().__init__() - self.layer = nn.LayerList() - - # cond self attention: layer 0 - self.layer.append( - T5LayerSelfAttentionCond(d_model=d_model, d_kv=d_kv, num_heads=num_heads, dropout_rate=dropout_rate) - ) - - # cross attention: layer 1 - self.layer.append( - T5LayerCrossAttention( - d_model=d_model, - d_kv=d_kv, - num_heads=num_heads, - dropout_rate=dropout_rate, - layer_norm_epsilon=layer_norm_epsilon, - ) - ) - - # Film Cond MLP + dropout: last layer - self.layer.append( - T5LayerFFCond(d_model=d_model, d_ff=d_ff, dropout_rate=dropout_rate, layer_norm_epsilon=layer_norm_epsilon) - ) - - def forward( - self, - hidden_states, - conditioning_emb=None, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - encoder_decoder_position_bias=None, - ): - hidden_states = self.layer[0]( - hidden_states, - conditioning_emb=conditioning_emb, - attention_mask=attention_mask, - ) - - if encoder_hidden_states is not None: - encoder_extended_attention_mask = paddle.where(encoder_attention_mask > 0, 0.0, -1e10).cast( - encoder_hidden_states.dtype - ) - - hidden_states = self.layer[1]( - hidden_states, - key_value_states=encoder_hidden_states, - attention_mask=encoder_extended_attention_mask, - ) - - # Apply Film Conditional Feed Forward layer - hidden_states = self.layer[-1](hidden_states, conditioning_emb) - - return (hidden_states,) - - -class T5LayerSelfAttentionCond(nn.Layer): - def __init__(self, d_model, d_kv, num_heads, dropout_rate): - super().__init__() - self.layer_norm = T5LayerNorm(d_model) - self.FiLMLayer = T5FiLMLayer(in_features=d_model * 4, out_features=d_model) - self.attention = Attention(query_dim=d_model, heads=num_heads, dim_head=d_kv, out_bias=False, scale_qk=False) - self.dropout = nn.Dropout(dropout_rate) - - def forward( - self, - hidden_states, - conditioning_emb=None, - attention_mask=None, - ): - # pre_self_attention_layer_norm - normed_hidden_states = self.layer_norm(hidden_states) - - if conditioning_emb is not None: - normed_hidden_states = self.FiLMLayer(normed_hidden_states, conditioning_emb) - - # Self-attention block - attention_output = self.attention(normed_hidden_states) - - hidden_states = hidden_states + self.dropout(attention_output) - - return hidden_states - - -class T5LayerCrossAttention(nn.Layer): - def __init__(self, d_model, d_kv, num_heads, dropout_rate, layer_norm_epsilon): - super().__init__() - self.attention = Attention(query_dim=d_model, heads=num_heads, dim_head=d_kv, out_bias=False, scale_qk=False) - self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon) - self.dropout = nn.Dropout(dropout_rate) - - def forward( - self, - hidden_states, - key_value_states=None, - attention_mask=None, - ): - normed_hidden_states = self.layer_norm(hidden_states) - attention_output = self.attention( - 
normed_hidden_states, - encoder_hidden_states=key_value_states, - attention_mask=attention_mask.squeeze(1), - ) - layer_output = hidden_states + self.dropout(attention_output) - return layer_output - - -class T5LayerFFCond(nn.Layer): - def __init__(self, d_model, d_ff, dropout_rate, layer_norm_epsilon): - super().__init__() - self.DenseReluDense = T5DenseGatedActDense(d_model=d_model, d_ff=d_ff, dropout_rate=dropout_rate) - self.film = T5FiLMLayer(in_features=d_model * 4, out_features=d_model) - self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon) - self.dropout = nn.Dropout(dropout_rate) - - def forward(self, hidden_states, conditioning_emb=None): - forwarded_states = self.layer_norm(hidden_states) - if conditioning_emb is not None: - forwarded_states = self.film(forwarded_states, conditioning_emb) - - forwarded_states = self.DenseReluDense(forwarded_states) - hidden_states = hidden_states + self.dropout(forwarded_states) - return hidden_states - - -class T5DenseGatedActDense(nn.Layer): - def __init__(self, d_model, d_ff, dropout_rate): - super().__init__() - self.wi_0 = nn.Linear(d_model, d_ff, bias_attr=False) - self.wi_1 = nn.Linear(d_model, d_ff, bias_attr=False) - self.wo = nn.Linear(d_ff, d_model, bias_attr=False) - self.dropout = nn.Dropout(dropout_rate) - self.act = NewGELUActivation() - - def forward(self, hidden_states): - hidden_gelu = self.act(self.wi_0(hidden_states)) - hidden_linear = self.wi_1(hidden_states) - hidden_states = hidden_gelu * hidden_linear - hidden_states = self.dropout(hidden_states) - - hidden_states = self.wo(hidden_states) - return hidden_states - - -class T5LayerNorm(nn.Layer): - """ - Construct a layernorm module in the T5 style No bias and no subtraction of mean. - """ - - def __init__(self, hidden_size, eps=1e-6): - super().__init__() - self.weight = self.create_parameter(shape=[hidden_size], default_initializer=nn.initializer.Constant(1.0)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean - # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated - # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for - # half-precision inputs is done in fp32 - - variance = paddle.pow(hidden_states.cast(paddle.float32), 2).mean(axis=-1, keepdim=True) - hidden_states = hidden_states * paddle.rsqrt(variance + self.variance_epsilon) - - # convert into half-precision if necessary - if self.weight.dtype == paddle.float16: - hidden_states = hidden_states.cast(paddle.float16) - return self.weight * hidden_states - - -class NewGELUActivation(nn.Layer): - """ - Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). 
Also see - the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 - """ - - def forward(self, input: paddle.Tensor) -> paddle.Tensor: - return ( - 0.5 * input * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0)))) - ) - - -class T5FiLMLayer(nn.Layer): - """ - FiLM Layer - """ - - def __init__(self, in_features, out_features): - super().__init__() - self.scale_bias = nn.Linear(in_features, out_features * 2, bias_attr=False) - - def forward(self, x, conditioning_emb): - emb = self.scale_bias(conditioning_emb) - scale, shift = emb.chunk(2, axis=-1) - x = x * (1 + scale) + shift - return x diff --git a/ppdiffusers/ppdiffusers/models/transformer_2d.py b/ppdiffusers/ppdiffusers/models/transformer_2d.py deleted file mode 100644 index 93c1a3c15834..000000000000 --- a/ppdiffusers/ppdiffusers/models/transformer_2d.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from dataclasses import dataclass -from typing import Optional - -import paddle -import paddle.nn.functional as F -from paddle import nn - -from ..configuration_utils import ConfigMixin, register_to_config -from ..models.embeddings import ImagePositionalEmbeddings -from ..utils import BaseOutput, deprecate -from .attention import BasicTransformerBlock -from .embeddings import PatchEmbed -from .modeling_utils import ModelMixin - - -@dataclass -class Transformer2DModelOutput(BaseOutput): - """ - Args: - sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete): - Hidden states conditioned on `encoder_hidden_states` input. If discrete, returns probability distributions - for the unnoised latent pixels. - """ - - sample: paddle.Tensor - - -class Transformer2DModel(ModelMixin, ConfigMixin): - """ - Transformer model for image-like data. Takes either discrete (classes of vector embeddings) or continuous (actual - embeddings) inputs. - - When input is continuous: First, project the input (aka embedding) and reshape to b, t, d. Then apply standard - transformer action. Finally, reshape to image. - - When input is discrete: First, input (classes of latent pixels) is converted to embeddings and has positional - embeddings applied, see `ImagePositionalEmbeddings`. Then apply standard transformer action. Finally, predict - classes of unnoised image. - - Note that it is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised - image do not contain a prediction for the masked pixel as the unnoised image cannot be masked. - - Parameters: - num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention. - attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head. 
- in_channels (`int`, *optional*): - Pass if the input is continuous. The number of channels in the input and output. - num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use. - dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. - cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use. - sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images. - Note that this is fixed at training time as it is used for learning a number of position embeddings. See - `ImagePositionalEmbeddings`. - num_vector_embeds (`int`, *optional*): - Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels. - Includes the class for the masked latent pixel. - activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. - num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`. - The number of diffusion steps used during training. Note that this is fixed at training time as it is used - to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for - up to but not more than steps than `num_embeds_ada_norm`. - attention_bias (`bool`, *optional*): - Configure if the TransformerBlocks' attention should contain a bias parameter. - """ - - @register_to_config - def __init__( - self, - num_attention_heads: int = 16, - attention_head_dim: int = 88, - in_channels: Optional[int] = None, - out_channels: Optional[int] = None, - num_layers: int = 1, - dropout: float = 0.0, - norm_num_groups: int = 32, - cross_attention_dim: Optional[int] = None, - attention_bias: bool = False, - sample_size: Optional[int] = None, - num_vector_embeds: Optional[int] = None, - patch_size: Optional[int] = None, - activation_fn: str = "geglu", - num_embeds_ada_norm: Optional[int] = None, - use_linear_projection: bool = False, - only_cross_attention: bool = False, - upcast_attention: bool = False, - norm_type: str = "layer_norm", - norm_elementwise_affine: bool = True, - ): - super().__init__() - self.use_linear_projection = use_linear_projection - self.num_attention_heads = num_attention_heads - self.attention_head_dim = attention_head_dim - self.inner_dim = inner_dim = num_attention_heads * attention_head_dim - - # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)` - # Define whether input is continuous or discrete depending on configuration - self.is_input_continuous = (in_channels is not None) and (patch_size is None) - self.is_input_vectorized = num_vector_embeds is not None - self.is_input_patches = in_channels is not None and patch_size is not None - - if norm_type == "layer_norm" and num_embeds_ada_norm is not None: - deprecation_message = ( - f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or" - " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config." - " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect" - " results in future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub, it" - " would be very nice if you could open a Pull request for the `transformer/config.json` file" - ) - deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False) - norm_type = "ada_norm" - - if self.is_input_continuous and self.is_input_vectorized: - raise ValueError( - f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make" - " sure that either `in_channels` or `num_vector_embeds` is None." - ) - elif self.is_input_vectorized and self.is_input_patches: - raise ValueError( - f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make" - " sure that either `num_vector_embeds` or `num_patches` is None." - ) - elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches: - raise ValueError( - f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:" - f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None." - ) - - # 2. Define input layers - if self.is_input_continuous: - self.in_channels = in_channels - - self.norm = nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-6) - if use_linear_projection: - self.proj_in = nn.Linear(in_channels, inner_dim) - else: - self.proj_in = nn.Conv2D(in_channels, inner_dim, kernel_size=1, stride=1, padding=0) - elif self.is_input_vectorized: - assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size" - assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed" - - self.height = sample_size - self.width = sample_size - self.num_vector_embeds = num_vector_embeds - self.num_latent_pixels = self.height * self.width - - self.latent_image_embedding = ImagePositionalEmbeddings( - num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width - ) - elif self.is_input_patches: - assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size" - - self.height = sample_size - self.width = sample_size - - self.patch_size = patch_size - self.pos_embed = PatchEmbed( - height=sample_size, - width=sample_size, - patch_size=patch_size, - in_channels=in_channels, - embed_dim=inner_dim, - ) - - # 3. Define transformers blocks - self.transformer_blocks = nn.LayerList( - [ - BasicTransformerBlock( - inner_dim, - num_attention_heads, - attention_head_dim, - dropout=dropout, - cross_attention_dim=cross_attention_dim, - activation_fn=activation_fn, - num_embeds_ada_norm=num_embeds_ada_norm, - attention_bias=attention_bias, - only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, - norm_type=norm_type, - norm_elementwise_affine=norm_elementwise_affine, - ) - for d in range(num_layers) - ] - ) - - # 4. 
Define output layers - self.out_channels = in_channels if out_channels is None else out_channels - if self.is_input_continuous: - # TODO: should use out_channels for continuous projections - if use_linear_projection: - self.proj_out = nn.Linear(inner_dim, in_channels) - else: - self.proj_out = nn.Conv2D(inner_dim, in_channels, kernel_size=1, stride=1, padding=0) - elif self.is_input_vectorized: - self.norm_out = nn.LayerNorm(inner_dim) - self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1) - elif self.is_input_patches: - # elementwise_affine=False - norm_kwargs = {"weight_attr": False, "bias_attr": False} - self.norm_out = nn.LayerNorm(inner_dim, epsilon=1e-6, **norm_kwargs) - self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim) - self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels) - - def forward( - self, - hidden_states, - encoder_hidden_states=None, - timestep=None, - class_labels=None, - cross_attention_kwargs=None, - return_dict: bool = True, - ): - """ - Args: - hidden_states ( When discrete, `paddle.Tensor` of shape `(batch size, num latent pixels)`. - When continuous, `paddle.Tensor` of shape `(batch size, channel, height, width)`): Input - hidden_states - encoder_hidden_states ( `paddle.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*): - Conditional embeddings for cross attention layer. If not given, cross-attention defaults to - self-attention. - timestep ( `paddle.Tensor`, *optional*): - Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step. - class_labels ( `paddle.Tensor` of shape `(batch size, num classes)`, *optional*): - Optional class labels to be applied as an embedding in AdaLayerZeroNorm. Used to indicate class labels - conditioning. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. - - Returns: - [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`: - [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. - """ - hidden_states = hidden_states.cast(self.dtype) - # 1. Input - if self.is_input_continuous: - _, _, height, width = hidden_states.shape - residual = hidden_states - hidden_states = self.norm(hidden_states) - if not self.use_linear_projection: - hidden_states = self.proj_in(hidden_states) - hidden_states = hidden_states.transpose([0, 2, 3, 1]).flatten(1, 2) - if self.use_linear_projection: - hidden_states = self.proj_in(hidden_states) - elif self.is_input_vectorized: - hidden_states = self.latent_image_embedding(hidden_states.cast("int64")) - elif self.is_input_patches: - hidden_states = self.pos_embed(hidden_states) - - # 2. Blocks - for block in self.transformer_blocks: - hidden_states = block( - hidden_states, - encoder_hidden_states=encoder_hidden_states, - timestep=timestep, - cross_attention_kwargs=cross_attention_kwargs, - class_labels=class_labels, - ) - - # 3. 
Output - if self.is_input_continuous: - if self.use_linear_projection: - hidden_states = self.proj_out(hidden_states) - hidden_states = hidden_states.reshape([-1, height, width, self.inner_dim]).transpose([0, 3, 1, 2]) - if not self.use_linear_projection: - hidden_states = self.proj_out(hidden_states) - output = hidden_states + residual - elif self.is_input_vectorized: - hidden_states = self.norm_out(hidden_states) - logits = self.out(hidden_states) - # (batch, self.num_vector_embeds - 1, self.num_latent_pixels) - logits = logits.transpose([0, 2, 1]) - - # log(p(x_0)) - output = F.log_softmax(logits.cast("float64"), axis=1).cast("float32") - elif self.is_input_patches: - # TODO: cleanup! - conditioning = self.transformer_blocks[0].norm1.emb( - timestep, class_labels, hidden_dtype=hidden_states.dtype - ) - shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, axis=1) - hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None] - hidden_states = self.proj_out_2(hidden_states) - - # unpatchify - height = width = int(hidden_states.shape[1] ** 0.5) - hidden_states = hidden_states.reshape( - (-1, height, width, self.patch_size, self.patch_size, self.out_channels) - ) - hidden_states = paddle.einsum("nhwpqc->nchpwq", hidden_states) - output = hidden_states.reshape((-1, self.out_channels, height * self.patch_size, width * self.patch_size)) - - if not return_dict: - return (output,) - - return Transformer2DModelOutput(sample=output) diff --git a/ppdiffusers/ppdiffusers/models/transformer_temporal.py b/ppdiffusers/ppdiffusers/models/transformer_temporal.py deleted file mode 100644 index bfd1985eb99a..000000000000 --- a/ppdiffusers/ppdiffusers/models/transformer_temporal.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import Optional - -import paddle -import paddle.nn as nn - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput -from .attention import BasicTransformerBlock -from .modeling_utils import ModelMixin - - -@dataclass -class TransformerTemporalModelOutput(BaseOutput): - """ - Args: - sample (`paddle.Tensor` of shape `(batch_size x num_frames, num_channels, height, width)`) - Hidden states conditioned on `encoder_hidden_states` input. - """ - - sample: paddle.Tensor - - -class TransformerTemporalModel(ModelMixin, ConfigMixin): - """ - Transformer model for video-like data. - - Parameters: - num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention. - attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head. - in_channels (`int`, *optional*): - Pass if the input is continuous. The number of channels in the input and output. - num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use. 
- dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. - cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use. - sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images. - Note that this is fixed at training time as it is used for learning a number of position embeddings. See - `ImagePositionalEmbeddings`. - activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. - attention_bias (`bool`, *optional*): - Configure if the TransformerBlocks' attention should contain a bias parameter. - double_self_attention (`bool`, *optional*): - Configure if each TransformerBlock should contain two self-attention layers - """ - - @register_to_config - def __init__( - self, - num_attention_heads: int = 16, - attention_head_dim: int = 88, - in_channels: Optional[int] = None, - out_channels: Optional[int] = None, - num_layers: int = 1, - dropout: float = 0.0, - norm_num_groups: int = 32, - cross_attention_dim: Optional[int] = None, - attention_bias: bool = False, - sample_size: Optional[int] = None, - activation_fn: str = "geglu", - norm_elementwise_affine: bool = True, - double_self_attention: bool = True, - ): - super().__init__() - self.num_attention_heads = num_attention_heads - self.attention_head_dim = attention_head_dim - inner_dim = num_attention_heads * attention_head_dim - self.in_channels = in_channels - self.norm = nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-06) - self.proj_in = nn.Linear(in_channels, inner_dim) - self.transformer_blocks = nn.LayerList( - [ - BasicTransformerBlock( - inner_dim, - num_attention_heads, - attention_head_dim, - dropout=dropout, - cross_attention_dim=cross_attention_dim, - activation_fn=activation_fn, - attention_bias=attention_bias, - double_self_attention=double_self_attention, - norm_elementwise_affine=norm_elementwise_affine, - ) - for d in range(num_layers) - ] - ) - self.proj_out = nn.Linear(inner_dim, in_channels) - - def forward( - self, - hidden_states, - encoder_hidden_states=None, - timestep=None, - class_labels=None, - num_frames=1, - cross_attention_kwargs=None, - return_dict: bool = True, - ): - """ - Args: - hidden_states ( When discrete, `paddle.Tensor` of shape `(batch size, num latent pixels)`. - When continuous, `paddle.Tensor` of shape `(batch size, channel, height, width)`): Input - hidden_states - encoder_hidden_states ( `paddleTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): - Conditional embeddings for cross attention layer. If not given, cross-attention defaults to - self-attention. - timestep ( `paddle.int64`, *optional*): - Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step. - class_labels ( `paddle.Tensor` of shape `(batch size, num classes)`, *optional*): - Optional class labels to be applied as an embedding in AdaLayerZeroNorm. Used to indicate class labels - conditioning. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. - - Returns: - [`~models.transformer_2d.TransformerTemporalModelOutput`] or `tuple`: - [`~models.transformer_2d.TransformerTemporalModelOutput`] if `return_dict` is True, otherwise a `tuple`. - When returning a tuple, the first element is the sample tensor. - """ - # 1. 
Input - batch_frames, channel, height, width = hidden_states.shape - batch_size = batch_frames // num_frames - residual = hidden_states - hidden_states = hidden_states[None, :].reshape((batch_size, num_frames, channel, height, width)) - hidden_states = hidden_states.transpose([0, 2, 1, 3, 4]) - hidden_states = self.norm(hidden_states) - hidden_states = hidden_states.transpose([0, 3, 4, 2, 1]).reshape( - (batch_size * height * width, num_frames, channel) - ) - hidden_states = self.proj_in(hidden_states) - # 2. Blocks - for block in self.transformer_blocks: - hidden_states = block( - hidden_states, - encoder_hidden_states=encoder_hidden_states, - timestep=timestep, - cross_attention_kwargs=cross_attention_kwargs, - class_labels=class_labels, - ) - # 3. Output - hidden_states = self.proj_out(hidden_states) - hidden_states = ( - hidden_states[None, None, :] - .reshape((batch_size, height, width, channel, num_frames)) - .transpose([0, 3, 4, 1, 2]) - ) - hidden_states = hidden_states.reshape((batch_frames, channel, height, width)) - output = hidden_states + residual - if not return_dict: - return (output,) - return TransformerTemporalModelOutput(sample=output) diff --git a/ppdiffusers/ppdiffusers/models/unet_1d.py b/ppdiffusers/ppdiffusers/models/unet_1d.py deleted file mode 100644 index 62ad365df630..000000000000 --- a/ppdiffusers/ppdiffusers/models/unet_1d.py +++ /dev/null @@ -1,250 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import Optional, Tuple, Union - -import paddle -import paddle.nn as nn - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput -from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps -from .modeling_utils import ModelMixin -from .unet_1d_blocks import get_down_block, get_mid_block, get_out_block, get_up_block - - -@dataclass -class UNet1DOutput(BaseOutput): - """ - Args: - sample (`paddle.Tensor` of shape `(batch_size, num_channels, sample_size)`): - Hidden states output. Output of last layer of model. - """ - - sample: paddle.Tensor - - -class UNet1DModel(ModelMixin, ConfigMixin): - r""" - UNet1DModel is a 1D UNet model that takes in a noisy sample and a timestep and returns sample shaped output. - - This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the model (such as downloading or saving, etc.) - - Parameters: - sample_size (`int`, *optional*): Default length of sample. Should be adaptable at runtime. - in_channels (`int`, *optional*, defaults to 2): Number of channels in the input sample. - out_channels (`int`, *optional*, defaults to 2): Number of channels in the output. - extra_in_channels (`int`, *optional*, defaults to 0): - Number of additional channels to be added to the input of the first down block. 
Useful for cases where the - input data has more channels than what the model is initially designed for. - time_embedding_type (`str`, *optional*, defaults to `"fourier"`): Type of time embedding to use. - freq_shift (`float`, *optional*, defaults to 0.0): Frequency shift for fourier time embedding. - flip_sin_to_cos (`bool`, *optional*, defaults to : - obj:`False`): Whether to flip sin to cos for fourier time embedding. - down_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("DownBlock1D", "DownBlock1DNoSkip", "AttnDownBlock1D")`): Tuple of downsample block types. - up_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("UpBlock1D", "UpBlock1DNoSkip", "AttnUpBlock1D")`): Tuple of upsample block types. - block_out_channels (`Tuple[int]`, *optional*, defaults to : - obj:`(32, 32, 64)`): Tuple of block output channels. - mid_block_type (`str`, *optional*, defaults to "UNetMidBlock1D"): block type for middle of UNet. - out_block_type (`str`, *optional*, defaults to `None`): optional output processing of UNet. - act_fn (`str`, *optional*, defaults to None): optional activation function in UNet blocks. - norm_num_groups (`int`, *optional*, defaults to 8): group norm member count in UNet blocks. - layers_per_block (`int`, *optional*, defaults to 1): added number of layers in a UNet block. - downsample_each_block (`int`, *optional*, defaults to False: - experimental feature for using a UNet without upsampling. - """ - - @register_to_config - def __init__( - self, - sample_size: int = 65536, - sample_rate: Optional[int] = None, - in_channels: int = 2, - out_channels: int = 2, - extra_in_channels: int = 0, - time_embedding_type: str = "fourier", - flip_sin_to_cos: bool = True, - use_timestep_embedding: bool = False, - freq_shift: float = 0.0, - down_block_types: Tuple[str] = ("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"), - up_block_types: Tuple[str] = ("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"), - mid_block_type: Tuple[str] = "UNetMidBlock1D", - out_block_type: str = None, - block_out_channels: Tuple[int] = (32, 32, 64), - act_fn: str = None, - norm_num_groups: int = 8, - layers_per_block: int = 1, - downsample_each_block: bool = False, - ): - super().__init__() - self.sample_size = sample_size - - # time - if time_embedding_type == "fourier": - self.time_proj = GaussianFourierProjection( - embedding_size=8, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos - ) - timestep_input_dim = 2 * block_out_channels[0] - elif time_embedding_type == "positional": - self.time_proj = Timesteps( - block_out_channels[0], flip_sin_to_cos=flip_sin_to_cos, downscale_freq_shift=freq_shift - ) - timestep_input_dim = block_out_channels[0] - - if use_timestep_embedding: - time_embed_dim = block_out_channels[0] * 4 - self.time_mlp = TimestepEmbedding( - in_channels=timestep_input_dim, - time_embed_dim=time_embed_dim, - act_fn=act_fn, - out_dim=block_out_channels[0], - ) - - self.down_blocks = nn.LayerList([]) - self.mid_block = None - self.up_blocks = nn.LayerList([]) - self.out_block = None - - # down - output_channel = in_channels - for i, down_block_type in enumerate(down_block_types): - input_channel = output_channel - output_channel = block_out_channels[i] - - if i == 0: - input_channel += extra_in_channels - - is_final_block = i == len(block_out_channels) - 1 - - down_block = get_down_block( - down_block_type, - num_layers=layers_per_block, - in_channels=input_channel, - out_channels=output_channel, - temb_channels=block_out_channels[0], - add_downsample=not 
is_final_block or downsample_each_block, - ) - self.down_blocks.append(down_block) - - # mid - self.mid_block = get_mid_block( - mid_block_type, - in_channels=block_out_channels[-1], - mid_channels=block_out_channels[-1], - out_channels=block_out_channels[-1], - embed_dim=block_out_channels[0], - num_layers=layers_per_block, - add_downsample=downsample_each_block, - ) - - # up - reversed_block_out_channels = list(reversed(block_out_channels)) - output_channel = reversed_block_out_channels[0] - if out_block_type is None: - final_upsample_channels = out_channels - else: - final_upsample_channels = block_out_channels[0] - - for i, up_block_type in enumerate(up_block_types): - prev_output_channel = output_channel - output_channel = ( - reversed_block_out_channels[i + 1] if i < len(up_block_types) - 1 else final_upsample_channels - ) - - is_final_block = i == len(block_out_channels) - 1 - - up_block = get_up_block( - up_block_type, - num_layers=layers_per_block, - in_channels=prev_output_channel, - out_channels=output_channel, - temb_channels=block_out_channels[0], - add_upsample=not is_final_block, - ) - self.up_blocks.append(up_block) - prev_output_channel = output_channel - - # out - num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32) - self.out_block = get_out_block( - out_block_type=out_block_type, - num_groups_out=num_groups_out, - embed_dim=block_out_channels[0], - out_channels=out_channels, - act_fn=act_fn, - fc_dim=block_out_channels[-1] // 4, - ) - - def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - return_dict: bool = True, - ) -> Union[UNet1DOutput, Tuple]: - r""" - Args: - sample (`paddle.Tensor`): `(batch_size, num_channels, sample_size)` noisy inputs tensor - timestep (`paddle.Tensor` or `float` or `int): (batch) timesteps - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~models.unet_1d.UNet1DOutput`] instead of a plain tuple. - - Returns: - [`~models.unet_1d.UNet1DOutput`] or `tuple`: [`~models.unet_1d.UNet1DOutput`] if `return_dict` is True, - otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. - """ - sample = sample.cast(self.dtype) - # 1. time - timesteps = timestep - if not paddle.is_tensor(timesteps): - timesteps = paddle.to_tensor([timesteps], dtype="int64") - elif paddle.is_tensor(timesteps) and len(timesteps.shape) == 0: - timesteps = timesteps[None] - - timestep_embed = self.time_proj(timesteps) - if self.config.use_timestep_embedding: - timestep_embed = self.time_mlp(timestep_embed) - else: - timestep_embed = timestep_embed[..., None] - timestep_embed = timestep_embed.tile([1, 1, sample.shape[2]]).cast(sample.dtype) - timestep_embed = timestep_embed.broadcast_to((sample.shape[:1] + timestep_embed.shape[1:])) - - # 2. down - down_block_res_samples = () - for downsample_block in self.down_blocks: - sample, res_samples = downsample_block(hidden_states=sample, temb=timestep_embed) - down_block_res_samples += res_samples - - # 3. mid - if self.mid_block: - sample = self.mid_block(sample, timestep_embed) - - # 4. up - for i, upsample_block in enumerate(self.up_blocks): - res_samples = down_block_res_samples[-1:] - down_block_res_samples = down_block_res_samples[:-1] - sample = upsample_block(sample, res_hidden_states_tuple=res_samples, temb=timestep_embed) - - # 5. 
post-process - if self.out_block: - sample = self.out_block(sample, timestep_embed) - - if not return_dict: - return (sample,) - - return UNet1DOutput(sample=sample) diff --git a/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py b/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py deleted file mode 100644 index dd2d1eb87a51..000000000000 --- a/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py +++ /dev/null @@ -1,724 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math -from typing import Optional - -import paddle -import paddle.nn.functional as F -from paddle import nn - -from ..utils import is_ppxformers_available -from .resnet import Downsample1D, ResidualTemporalBlock1D, Upsample1D, rearrange_dims - - -class DownResnetBlock1D(nn.Layer): - def __init__( - self, - in_channels, - out_channels=None, - num_layers=1, - conv_shortcut=False, - temb_channels=32, - groups=32, - groups_out=None, - non_linearity=None, - time_embedding_norm="default", - output_scale_factor=1.0, - add_downsample=True, - ): - super().__init__() - self.in_channels = in_channels - out_channels = in_channels if out_channels is None else out_channels - self.out_channels = out_channels - self.use_conv_shortcut = conv_shortcut - self.time_embedding_norm = time_embedding_norm - self.add_downsample = add_downsample - self.output_scale_factor = output_scale_factor - - if groups_out is None: - groups_out = groups - - # there will always be at least one resnet - resnets = [ResidualTemporalBlock1D(in_channels, out_channels, embed_dim=temb_channels)] - - for _ in range(num_layers): - resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=temb_channels)) - - self.resnets = nn.LayerList(resnets) - - if non_linearity == "swish": - self.nonlinearity = lambda x: F.silu(x) - elif non_linearity == "mish": - self.nonlinearity = nn.Mish() - elif non_linearity == "silu": - self.nonlinearity = nn.Silu() - else: - self.nonlinearity = None - - self.downsample = None - if add_downsample: - self.downsample = Downsample1D(out_channels, use_conv=True, padding=1) - - def forward(self, hidden_states, temb=None): - output_states = () - - hidden_states = self.resnets[0](hidden_states, temb) - for resnet in self.resnets[1:]: - hidden_states = resnet(hidden_states, temb) - - output_states += (hidden_states,) - - if self.nonlinearity is not None: - hidden_states = self.nonlinearity(hidden_states) - - if self.downsample is not None: - hidden_states = self.downsample(hidden_states) - - return hidden_states, output_states - - -class UpResnetBlock1D(nn.Layer): - def __init__( - self, - in_channels, - out_channels=None, - num_layers=1, - temb_channels=32, - groups=32, - groups_out=None, - non_linearity=None, - time_embedding_norm="default", - output_scale_factor=1.0, - add_upsample=True, - ): - super().__init__() - self.in_channels = in_channels - out_channels = in_channels if out_channels is None else 
out_channels - self.out_channels = out_channels - self.time_embedding_norm = time_embedding_norm - self.add_upsample = add_upsample - self.output_scale_factor = output_scale_factor - - if groups_out is None: - groups_out = groups - - # there will always be at least one resnet - resnets = [ResidualTemporalBlock1D(2 * in_channels, out_channels, embed_dim=temb_channels)] - - for _ in range(num_layers): - resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=temb_channels)) - - self.resnets = nn.LayerList(resnets) - - if non_linearity == "swish": - self.nonlinearity = lambda x: F.silu(x) - elif non_linearity == "mish": - self.nonlinearity = nn.Mish() - elif non_linearity == "silu": - self.nonlinearity = nn.Silu() - else: - self.nonlinearity = None - - self.upsample = None - if add_upsample: - self.upsample = Upsample1D(out_channels, use_conv_transpose=True) - - def forward(self, hidden_states, res_hidden_states_tuple=None, temb=None): - if res_hidden_states_tuple is not None: - res_hidden_states = res_hidden_states_tuple[-1] - hidden_states = paddle.concat((hidden_states, res_hidden_states), axis=1) - - hidden_states = self.resnets[0](hidden_states, temb) - for resnet in self.resnets[1:]: - hidden_states = resnet(hidden_states, temb) - - if self.nonlinearity is not None: - hidden_states = self.nonlinearity(hidden_states) - - if self.upsample is not None: - hidden_states = self.upsample(hidden_states) - - return hidden_states - - -class ValueFunctionMidBlock1D(nn.Layer): - def __init__(self, in_channels, out_channels, embed_dim): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.embed_dim = embed_dim - - self.res1 = ResidualTemporalBlock1D(in_channels, in_channels // 2, embed_dim=embed_dim) - self.down1 = Downsample1D(out_channels // 2, use_conv=True) - self.res2 = ResidualTemporalBlock1D(in_channels // 2, in_channels // 4, embed_dim=embed_dim) - self.down2 = Downsample1D(out_channels // 4, use_conv=True) - - def forward(self, x, temb=None): - x = self.res1(x, temb) - x = self.down1(x) - x = self.res2(x, temb) - x = self.down2(x) - return x - - -class MidResTemporalBlock1D(nn.Layer): - def __init__( - self, - in_channels, - out_channels, - embed_dim, - num_layers: int = 1, - add_downsample: bool = False, - add_upsample: bool = False, - non_linearity=None, - ): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.add_downsample = add_downsample - - # there will always be at least one resnet - resnets = [ResidualTemporalBlock1D(in_channels, out_channels, embed_dim=embed_dim)] - - for _ in range(num_layers): - resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=embed_dim)) - - self.resnets = nn.LayerList(resnets) - - if non_linearity == "swish": - self.nonlinearity = lambda x: F.silu(x) - elif non_linearity == "mish": - self.nonlinearity = nn.Mish() - elif non_linearity == "silu": - self.nonlinearity = nn.Silu() - else: - self.nonlinearity = None - - self.upsample = None - if add_upsample: - self.upsample = Downsample1D(out_channels, use_conv=True) - - self.downsample = None - if add_downsample: - self.downsample = Downsample1D(out_channels, use_conv=True) - - if self.upsample and self.downsample: - raise ValueError("Block cannot downsample and upsample") - - def forward(self, hidden_states, temb): - hidden_states = self.resnets[0](hidden_states, temb) - for resnet in self.resnets[1:]: - hidden_states = resnet(hidden_states, temb) - - if self.upsample: - 
hidden_states = self.upsample(hidden_states) - if self.downsample: - self.downsample = self.downsample(hidden_states) - - return hidden_states - - -class OutConv1DBlock(nn.Layer): - def __init__(self, num_groups_out, out_channels, embed_dim, act_fn): - super().__init__() - self.final_conv1d_1 = nn.Conv1D(embed_dim, embed_dim, 5, padding=2) - self.final_conv1d_gn = nn.GroupNorm(num_groups_out, embed_dim) - if act_fn == "silu": - self.final_conv1d_act = nn.Silu() - if act_fn == "mish": - self.final_conv1d_act = nn.Mish() - self.final_conv1d_2 = nn.Conv1D(embed_dim, out_channels, 1) - - def forward(self, hidden_states, temb=None): - hidden_states = self.final_conv1d_1(hidden_states) - hidden_states = rearrange_dims(hidden_states) - hidden_states = self.final_conv1d_gn(hidden_states) - hidden_states = rearrange_dims(hidden_states) - hidden_states = self.final_conv1d_act(hidden_states) - hidden_states = self.final_conv1d_2(hidden_states) - return hidden_states - - -class OutValueFunctionBlock(nn.Layer): - def __init__(self, fc_dim, embed_dim): - super().__init__() - self.final_block = nn.LayerList( - [ - nn.Linear(fc_dim + embed_dim, fc_dim // 2), - nn.Mish(), - nn.Linear(fc_dim // 2, 1), - ] - ) - - def forward(self, hidden_states, temb): - hidden_states = hidden_states.reshape([hidden_states.shape[0], -1]) - hidden_states = paddle.concat((hidden_states, temb), axis=-1) - for layer in self.final_block: - hidden_states = layer(hidden_states) - - return hidden_states - - -_kernels = { - "linear": [1 / 8, 3 / 8, 3 / 8, 1 / 8], - "cubic": [-0.01171875, -0.03515625, 0.11328125, 0.43359375, 0.43359375, 0.11328125, -0.03515625, -0.01171875], - "lanczos3": [ - 0.003689131001010537, - 0.015056144446134567, - -0.03399861603975296, - -0.066637322306633, - 0.13550527393817902, - 0.44638532400131226, - 0.44638532400131226, - 0.13550527393817902, - -0.066637322306633, - -0.03399861603975296, - 0.015056144446134567, - 0.003689131001010537, - ], -} - - -class Downsample1d(nn.Layer): - def __init__(self, kernel="linear", pad_mode="reflect"): - super().__init__() - self.pad_mode = pad_mode - kernel_1d = paddle.to_tensor(_kernels[kernel]) - self.pad = kernel_1d.shape[0] // 2 - 1 - self.register_buffer("kernel", kernel_1d) - - def forward(self, hidden_states): - hidden_states = F.pad(hidden_states, (self.pad,) * 2, self.pad_mode, data_format="NCL") - weight = paddle.zeros( - [hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]], dtype=hidden_states.dtype - ) - indices = paddle.arange(hidden_states.shape[1]) - weight[indices, indices] = self.kernel.cast(weight.dtype) - return F.conv1d(hidden_states, weight, stride=2) - - -class Upsample1d(nn.Layer): - def __init__(self, kernel="linear", pad_mode="reflect"): - super().__init__() - self.pad_mode = pad_mode - kernel_1d = paddle.to_tensor(_kernels[kernel]) - self.pad = kernel_1d.shape[0] // 2 - 1 - self.register_buffer("kernel", kernel_1d) - - def forward(self, hidden_states, temb=None): - hidden_states = F.pad(hidden_states, ((self.pad + 1) // 2,) * 2, self.pad_mode, data_format="NCL") - weight = paddle.zeros( - [hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]], dtype=hidden_states.dtype - ) - indices = paddle.arange(hidden_states.shape[1]) - weight[indices, indices] = self.kernel.cast(weight.dtype) - return F.conv1d_transpose(hidden_states, weight, stride=2, padding=self.pad * 2 + 1) - - -class SelfAttention1d(nn.Layer): - def __init__(self, in_channels, n_head=1, dropout_rate=0.0): - super().__init__() - self.channels = 
in_channels - self.group_norm = nn.GroupNorm(1, num_channels=in_channels) - self.num_heads = n_head - self.head_size = in_channels // n_head - self.scale = 1 / math.sqrt(self.head_size) - - self.query = nn.Linear(self.channels, self.channels) - self.key = nn.Linear(self.channels, self.channels) - self.value = nn.Linear(self.channels, self.channels) - - self.proj_attn = nn.Linear(self.channels, self.channels) - - self.dropout = nn.Dropout(dropout_rate) - - self._use_memory_efficient_attention_xformers = False - self._attention_op = None - - def reshape_heads_to_batch_dim(self, tensor, transpose=True): - tensor = tensor.reshape([0, 0, self.num_heads, self.head_size]) - if transpose: - tensor = tensor.transpose([0, 2, 1, 3]) - return tensor - - def reshape_batch_dim_to_heads(self, tensor, transpose=True): - if transpose: - tensor = tensor.transpose([0, 2, 1, 3]) - tensor = tensor.reshape([0, 0, tensor.shape[2] * tensor.shape[3]]) - return tensor - - def set_use_memory_efficient_attention_xformers( - self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[str] = None - ): - if self.head_size > 128 and attention_op == "flash": - attention_op = "cutlass" - if use_memory_efficient_attention_xformers: - if not is_ppxformers_available(): - raise NotImplementedError( - "requires the scaled_dot_product_attention but your PaddlePaddle donot have this. Checkout the instructions on the installation page: https://www.paddlepaddle.org.cn/install/quick and follow the ones that match your environment." - ) - else: - try: - _ = F.scaled_dot_product_attention_( - paddle.randn((1, 1, 2, 40), dtype=paddle.float16), - paddle.randn((1, 1, 2, 40), dtype=paddle.float16), - paddle.randn((1, 1, 2, 40), dtype=paddle.float16), - attention_op=attention_op, - ) - except Exception as e: - raise e - - self._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers - self._attention_op = attention_op - - def forward(self, hidden_states): - residual = hidden_states - - hidden_states = self.group_norm(hidden_states) - hidden_states = hidden_states.transpose([0, 2, 1]) - - query_proj = self.query(hidden_states) - key_proj = self.key(hidden_states) - value_proj = self.value(hidden_states) - - query_proj = self.reshape_heads_to_batch_dim( - query_proj, transpose=not self._use_memory_efficient_attention_xformers - ) - key_proj = self.reshape_heads_to_batch_dim( - key_proj, transpose=not self._use_memory_efficient_attention_xformers - ) - value_proj = self.reshape_heads_to_batch_dim( - value_proj, transpose=not self._use_memory_efficient_attention_xformers - ) - - if self._use_memory_efficient_attention_xformers: - hidden_states = F.scaled_dot_product_attention_( - query_proj, - key_proj, - value_proj, - attn_mask=None, - scale=self.scale, - dropout_p=0.0, - training=self.training, - attention_op=self._attention_op, - ) - else: - attention_scores = paddle.matmul(query_proj, key_proj, transpose_y=True) * self.scale - attention_probs = F.softmax(attention_scores.cast("float32"), axis=-1).cast(attention_scores.dtype) - hidden_states = paddle.matmul(attention_probs, value_proj) - - # reshape hidden_states - hidden_states = self.reshape_batch_dim_to_heads( - hidden_states, transpose=not self._use_memory_efficient_attention_xformers - ) - - # compute next hidden_states - hidden_states = self.proj_attn(hidden_states) - hidden_states = hidden_states.transpose([0, 2, 1]) - hidden_states = self.dropout(hidden_states) - - output = hidden_states + residual - - return output - - -class 
ResConvBlock(nn.Layer): - def __init__(self, in_channels, mid_channels, out_channels, is_last=False): - super().__init__() - self.is_last = is_last - self.has_conv_skip = in_channels != out_channels - - if self.has_conv_skip: - self.conv_skip = nn.Conv1D(in_channels, out_channels, 1, bias_attr=False) - - self.conv_1 = nn.Conv1D(in_channels, mid_channels, 5, padding=2) - self.group_norm_1 = nn.GroupNorm(1, mid_channels) - self.gelu_1 = nn.GELU() - self.conv_2 = nn.Conv1D(mid_channels, out_channels, 5, padding=2) - - if not self.is_last: - self.group_norm_2 = nn.GroupNorm(1, out_channels) - self.gelu_2 = nn.GELU() - - def forward(self, hidden_states): - residual = self.conv_skip(hidden_states) if self.has_conv_skip else hidden_states - - hidden_states = self.conv_1(hidden_states) - hidden_states = self.group_norm_1(hidden_states) - hidden_states = self.gelu_1(hidden_states) - hidden_states = self.conv_2(hidden_states) - - if not self.is_last: - hidden_states = self.group_norm_2(hidden_states) - hidden_states = self.gelu_2(hidden_states) - - output = hidden_states + residual - return output - - -class UNetMidBlock1D(nn.Layer): - def __init__(self, mid_channels, in_channels, out_channels=None): - super().__init__() - - out_channels = in_channels if out_channels is None else out_channels - - # there is always at least one resnet - self.down = Downsample1d("cubic") - resnets = [ - ResConvBlock(in_channels, mid_channels, mid_channels), - ResConvBlock(mid_channels, mid_channels, mid_channels), - ResConvBlock(mid_channels, mid_channels, mid_channels), - ResConvBlock(mid_channels, mid_channels, mid_channels), - ResConvBlock(mid_channels, mid_channels, mid_channels), - ResConvBlock(mid_channels, mid_channels, out_channels), - ] - attentions = [ - SelfAttention1d(mid_channels, mid_channels // 32), - SelfAttention1d(mid_channels, mid_channels // 32), - SelfAttention1d(mid_channels, mid_channels // 32), - SelfAttention1d(mid_channels, mid_channels // 32), - SelfAttention1d(mid_channels, mid_channels // 32), - SelfAttention1d(out_channels, out_channels // 32), - ] - self.up = Upsample1d(kernel="cubic") - - self.attentions = nn.LayerList(attentions) - self.resnets = nn.LayerList(resnets) - - def forward(self, hidden_states, temb=None): - hidden_states = self.down(hidden_states) - for attn, resnet in zip(self.attentions, self.resnets): - hidden_states = resnet(hidden_states) - hidden_states = attn(hidden_states) - - hidden_states = self.up(hidden_states) - - return hidden_states - - -class AttnDownBlock1D(nn.Layer): - def __init__(self, out_channels, in_channels, mid_channels=None): - super().__init__() - mid_channels = out_channels if mid_channels is None else mid_channels - - self.down = Downsample1d("cubic") - resnets = [ - ResConvBlock(in_channels, mid_channels, mid_channels), - ResConvBlock(mid_channels, mid_channels, mid_channels), - ResConvBlock(mid_channels, mid_channels, out_channels), - ] - attentions = [ - SelfAttention1d(mid_channels, mid_channels // 32), - SelfAttention1d(mid_channels, mid_channels // 32), - SelfAttention1d(out_channels, out_channels // 32), - ] - - self.attentions = nn.LayerList(attentions) - self.resnets = nn.LayerList(resnets) - - def forward(self, hidden_states, temb=None): - hidden_states = self.down(hidden_states) - - for resnet, attn in zip(self.resnets, self.attentions): - hidden_states = resnet(hidden_states) - hidden_states = attn(hidden_states) - - return hidden_states, (hidden_states,) - - -class DownBlock1D(nn.Layer): - def __init__(self, out_channels, 
in_channels, mid_channels=None): - super().__init__() - mid_channels = out_channels if mid_channels is None else mid_channels - - self.down = Downsample1d("cubic") - resnets = [ - ResConvBlock(in_channels, mid_channels, mid_channels), - ResConvBlock(mid_channels, mid_channels, mid_channels), - ResConvBlock(mid_channels, mid_channels, out_channels), - ] - - self.resnets = nn.LayerList(resnets) - - def forward(self, hidden_states, temb=None): - hidden_states = self.down(hidden_states) - - for resnet in self.resnets: - hidden_states = resnet(hidden_states) - - return hidden_states, (hidden_states,) - - -class DownBlock1DNoSkip(nn.Layer): - def __init__(self, out_channels, in_channels, mid_channels=None): - super().__init__() - mid_channels = out_channels if mid_channels is None else mid_channels - - resnets = [ - ResConvBlock(in_channels, mid_channels, mid_channels), - ResConvBlock(mid_channels, mid_channels, mid_channels), - ResConvBlock(mid_channels, mid_channels, out_channels), - ] - - self.resnets = nn.LayerList(resnets) - - def forward(self, hidden_states, temb=None): - hidden_states = paddle.concat([hidden_states, temb], axis=1) - for resnet in self.resnets: - hidden_states = resnet(hidden_states) - - return hidden_states, (hidden_states,) - - -class AttnUpBlock1D(nn.Layer): - def __init__(self, in_channels, out_channels, mid_channels=None): - super().__init__() - mid_channels = out_channels if mid_channels is None else mid_channels - - resnets = [ - ResConvBlock(2 * in_channels, mid_channels, mid_channels), - ResConvBlock(mid_channels, mid_channels, mid_channels), - ResConvBlock(mid_channels, mid_channels, out_channels), - ] - attentions = [ - SelfAttention1d(mid_channels, mid_channels // 32), - SelfAttention1d(mid_channels, mid_channels // 32), - SelfAttention1d(out_channels, out_channels // 32), - ] - - self.attentions = nn.LayerList(attentions) - self.resnets = nn.LayerList(resnets) - self.up = Upsample1d(kernel="cubic") - - def forward(self, hidden_states, res_hidden_states_tuple, temb=None): - res_hidden_states = res_hidden_states_tuple[-1] - hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) - - for resnet, attn in zip(self.resnets, self.attentions): - hidden_states = resnet(hidden_states) - hidden_states = attn(hidden_states) - - hidden_states = self.up(hidden_states) - - return hidden_states - - -class UpBlock1D(nn.Layer): - def __init__(self, in_channels, out_channels, mid_channels=None): - super().__init__() - mid_channels = in_channels if mid_channels is None else mid_channels - - resnets = [ - ResConvBlock(2 * in_channels, mid_channels, mid_channels), - ResConvBlock(mid_channels, mid_channels, mid_channels), - ResConvBlock(mid_channels, mid_channels, out_channels), - ] - - self.resnets = nn.LayerList(resnets) - self.up = Upsample1d(kernel="cubic") - - def forward(self, hidden_states, res_hidden_states_tuple, temb=None): - res_hidden_states = res_hidden_states_tuple[-1] - hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) - - for resnet in self.resnets: - hidden_states = resnet(hidden_states) - - hidden_states = self.up(hidden_states) - - return hidden_states - - -class UpBlock1DNoSkip(nn.Layer): - def __init__(self, in_channels, out_channels, mid_channels=None): - super().__init__() - mid_channels = in_channels if mid_channels is None else mid_channels - - resnets = [ - ResConvBlock(2 * in_channels, mid_channels, mid_channels), - ResConvBlock(mid_channels, mid_channels, mid_channels), - ResConvBlock(mid_channels, mid_channels, 
out_channels, is_last=True), - ] - - self.resnets = nn.LayerList(resnets) - - def forward(self, hidden_states, res_hidden_states_tuple, temb=None): - res_hidden_states = res_hidden_states_tuple[-1] - hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) - - for resnet in self.resnets: - hidden_states = resnet(hidden_states) - - return hidden_states - - -def get_down_block(down_block_type, num_layers, in_channels, out_channels, temb_channels, add_downsample): - if down_block_type == "DownResnetBlock1D": - return DownResnetBlock1D( - in_channels=in_channels, - num_layers=num_layers, - out_channels=out_channels, - temb_channels=temb_channels, - add_downsample=add_downsample, - ) - elif down_block_type == "DownBlock1D": - return DownBlock1D(out_channels=out_channels, in_channels=in_channels) - elif down_block_type == "AttnDownBlock1D": - return AttnDownBlock1D(out_channels=out_channels, in_channels=in_channels) - elif down_block_type == "DownBlock1DNoSkip": - return DownBlock1DNoSkip(out_channels=out_channels, in_channels=in_channels) - raise ValueError(f"{down_block_type} does not exist.") - - -def get_up_block(up_block_type, num_layers, in_channels, out_channels, temb_channels, add_upsample): - if up_block_type == "UpResnetBlock1D": - return UpResnetBlock1D( - in_channels=in_channels, - num_layers=num_layers, - out_channels=out_channels, - temb_channels=temb_channels, - add_upsample=add_upsample, - ) - elif up_block_type == "UpBlock1D": - return UpBlock1D(in_channels=in_channels, out_channels=out_channels) - elif up_block_type == "AttnUpBlock1D": - return AttnUpBlock1D(in_channels=in_channels, out_channels=out_channels) - elif up_block_type == "UpBlock1DNoSkip": - return UpBlock1DNoSkip(in_channels=in_channels, out_channels=out_channels) - raise ValueError(f"{up_block_type} does not exist.") - - -def get_mid_block(mid_block_type, num_layers, in_channels, mid_channels, out_channels, embed_dim, add_downsample): - if mid_block_type == "MidResTemporalBlock1D": - return MidResTemporalBlock1D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - embed_dim=embed_dim, - add_downsample=add_downsample, - ) - elif mid_block_type == "ValueFunctionMidBlock1D": - return ValueFunctionMidBlock1D(in_channels=in_channels, out_channels=out_channels, embed_dim=embed_dim) - elif mid_block_type == "UNetMidBlock1D": - return UNetMidBlock1D(in_channels=in_channels, mid_channels=mid_channels, out_channels=out_channels) - raise ValueError(f"{mid_block_type} does not exist.") - - -def get_out_block(*, out_block_type, num_groups_out, embed_dim, out_channels, act_fn, fc_dim): - if out_block_type == "OutConv1DBlock": - return OutConv1DBlock(num_groups_out, out_channels, embed_dim, act_fn) - elif out_block_type == "ValueFunction": - return OutValueFunctionBlock(fc_dim, embed_dim) - return None diff --git a/ppdiffusers/ppdiffusers/models/unet_2d.py b/ppdiffusers/ppdiffusers/models/unet_2d.py deleted file mode 100644 index 5f8449e766c0..000000000000 --- a/ppdiffusers/ppdiffusers/models/unet_2d.py +++ /dev/null @@ -1,351 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
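A minimal sketch of how the removed 1D factory helpers above (get_down_block, get_mid_block, get_up_block, get_out_block) fit together; the block-type strings and keyword names follow the deleted unet_1d_blocks.py, while the channel sizes below are illustrative assumptions only, not values taken from this patch:

    # Illustrative assembly of 1D blocks via the deleted factory helpers
    # (channel sizes are arbitrary placeholders).
    down = get_down_block(
        "DownResnetBlock1D", num_layers=1, in_channels=14,
        out_channels=32, temb_channels=32, add_downsample=True,
    )
    mid = get_mid_block(
        "MidResTemporalBlock1D", num_layers=1, in_channels=32,
        mid_channels=32, out_channels=32, embed_dim=32, add_downsample=False,
    )
    up = get_up_block(
        "UpResnetBlock1D", num_layers=1, in_channels=32,
        out_channels=14, temb_channels=32, add_upsample=True,
    )
    out = get_out_block(
        out_block_type="OutConv1DBlock", num_groups_out=8, embed_dim=32,
        out_channels=14, act_fn="silu", fc_dim=8,
    )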
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from dataclasses import dataclass -from typing import Optional, Tuple, Union - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput -from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps -from .modeling_utils import ModelMixin -from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block - - -@dataclass -class UNet2DOutput(BaseOutput): - """ - Args: - sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): - Hidden states output. Output of last layer of model. - """ - - sample: paddle.Tensor - - -class UNet2DModel(ModelMixin, ConfigMixin): - r""" - UNet2DModel is a 2D UNet model that takes in a noisy sample and a timestep and returns sample shaped output. - - This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the model (such as downloading or saving, etc.) - - Parameters: - sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): - Height and width of input/output sample. Dimensions must be a multiple of `2 ** (len(block_out_channels) - - 1)`. - in_channels (`int`, *optional*, defaults to 3): Number of channels in the input image. - out_channels (`int`, *optional*, defaults to 3): Number of channels in the output. - center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample. - time_embedding_type (`str`, *optional*, defaults to `"positional"`): Type of time embedding to use. - freq_shift (`int`, *optional*, defaults to 0): Frequency shift for fourier time embedding. - flip_sin_to_cos (`bool`, *optional*, defaults to : - obj:`True`): Whether to flip sin to cos for fourier time embedding. - down_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D")`): Tuple of downsample block - types. - mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2D"`): - The mid block type. Choose from `UNetMidBlock2D` or `UnCLIPUNetMidBlock2D`. - up_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D")`): Tuple of upsample block types. - block_out_channels (`Tuple[int]`, *optional*, defaults to : - obj:`(224, 448, 672, 896)`): Tuple of block output channels. - layers_per_block (`int`, *optional*, defaults to `2`): The number of layers per block. - mid_block_scale_factor (`float`, *optional*, defaults to `1`): The scale factor for the mid block. - downsample_padding (`int`, *optional*, defaults to `1`): The padding for the downsample convolution. - act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. - attention_head_dim (`int`, *optional*, defaults to `8`): The attention head dimension. - norm_num_groups (`int`, *optional*, defaults to `32`): The number of groups for the normalization. - norm_eps (`float`, *optional*, defaults to `1e-5`): The epsilon for the normalization. 
- resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config - for resnet blocks, see [`~models.resnet.ResnetBlock2D`]. Choose from `default` or `scale_shift`. - class_embed_type (`str`, *optional*, defaults to None): - The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`, - `"timestep"`, or `"identity"`. - num_class_embeds (`int`, *optional*, defaults to None): - Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing - class conditioning with `class_embed_type` equal to `None`. - """ - - @register_to_config - def __init__( - self, - sample_size: Optional[Union[int, Tuple[int, int]]] = None, - in_channels: int = 3, - out_channels: int = 3, - center_input_sample: bool = False, - time_embedding_type: str = "positional", - freq_shift: int = 0, - flip_sin_to_cos: bool = True, - down_block_types: Tuple[str] = ("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D"), - up_block_types: Tuple[str] = ("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"), - block_out_channels: Tuple[int] = (224, 448, 672, 896), - layers_per_block: int = 2, - mid_block_scale_factor: float = 1, - downsample_padding: int = 1, - act_fn: str = "silu", - attention_head_dim: Optional[int] = 8, - norm_num_groups: int = 32, - norm_eps: float = 1e-5, - resnet_time_scale_shift: str = "default", - add_attention: bool = True, - class_embed_type: Optional[str] = None, - num_class_embeds: Optional[int] = None, - resnet_pre_temb_non_linearity: Optional[bool] = False, - ): - super().__init__() - - self.sample_size = sample_size - time_embed_dim = block_out_channels[0] * 4 - - # Check inputs - if len(down_block_types) != len(up_block_types): - raise ValueError( - f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}." - ) - - if len(block_out_channels) != len(down_block_types): - raise ValueError( - f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." 
- ) - - # input - self.conv_in = nn.Conv2D(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1)) - - # time - if time_embedding_type == "fourier": - self.time_proj = GaussianFourierProjection(embedding_size=block_out_channels[0], scale=16) - timestep_input_dim = 2 * block_out_channels[0] - elif time_embedding_type == "positional": - self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) - timestep_input_dim = block_out_channels[0] - - self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) - - # class embedding - if class_embed_type is None and num_class_embeds is not None: - self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) - elif class_embed_type == "timestep": - self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) - elif class_embed_type == "identity": - self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) - else: - self.class_embedding = None - - self.down_blocks = nn.LayerList([]) - self.mid_block = None - self.up_blocks = nn.LayerList([]) - - # pre_temb_act_fun opt - self.resnet_pre_temb_non_linearity = resnet_pre_temb_non_linearity - if resnet_pre_temb_non_linearity: - if act_fn == "swish": - self.down_resnet_temb_nonlinearity = lambda x: F.silu(x) - elif act_fn == "mish": - self.down_resnet_temb_nonlinearity = nn.Mish() - elif act_fn == "silu": - self.down_resnet_temb_nonlinearity = nn.Silu() - elif act_fn == "gelu": - self.down_resnet_temb_nonlinearity = nn.GELU() - - # down - output_channel = block_out_channels[0] - for i, down_block_type in enumerate(down_block_types): - input_channel = output_channel - output_channel = block_out_channels[i] - is_final_block = i == len(block_out_channels) - 1 - - down_block = get_down_block( - down_block_type, - num_layers=layers_per_block, - in_channels=input_channel, - out_channels=output_channel, - temb_channels=time_embed_dim, - add_downsample=not is_final_block, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - resnet_groups=norm_num_groups, - attn_num_head_channels=attention_head_dim, - downsample_padding=downsample_padding, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - self.down_blocks.append(down_block) - - # mid - self.mid_block = UNetMidBlock2D( - in_channels=block_out_channels[-1], - temb_channels=time_embed_dim, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - output_scale_factor=mid_block_scale_factor, - resnet_time_scale_shift=resnet_time_scale_shift, - attn_num_head_channels=attention_head_dim, - resnet_groups=norm_num_groups, - add_attention=add_attention, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - - # up - reversed_block_out_channels = list(reversed(block_out_channels)) - output_channel = reversed_block_out_channels[0] - for i, up_block_type in enumerate(up_block_types): - prev_output_channel = output_channel - output_channel = reversed_block_out_channels[i] - input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] - - is_final_block = i == len(block_out_channels) - 1 - - up_block = get_up_block( - up_block_type, - num_layers=layers_per_block + 1, - in_channels=input_channel, - out_channels=output_channel, - prev_output_channel=prev_output_channel, - temb_channels=time_embed_dim, - add_upsample=not is_final_block, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - resnet_groups=norm_num_groups, - attn_num_head_channels=attention_head_dim, - resnet_time_scale_shift=resnet_time_scale_shift, - 
resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - self.up_blocks.append(up_block) - prev_output_channel = output_channel - - # out - num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32) - self.conv_norm_out = nn.GroupNorm( - num_channels=block_out_channels[0], num_groups=num_groups_out, epsilon=norm_eps - ) - self.conv_act = nn.Silu() - self.conv_out = nn.Conv2D(block_out_channels[0], out_channels, kernel_size=3, padding=1) - - def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - class_labels: Optional[paddle.Tensor] = None, - return_dict: bool = True, - ) -> Union[UNet2DOutput, Tuple]: - r""" - Args: - sample (`paddle.Tensor`): (batch, channel, height, width) noisy inputs tensor - timestep (`paddle.Tensor` or `float` or `int): (batch) timesteps - class_labels (`paddle.Tensor`, *optional*, defaults to `None`): - Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~models.unet_2d.UNet2DOutput`] instead of a plain tuple. - - Returns: - [`~models.unet_2d.UNet2DOutput`] or `tuple`: [`~models.unet_2d.UNet2DOutput`] if `return_dict` is True, - otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. - """ - # TODO junnyu, add this to support pure fp16 - sample = sample.cast(self.dtype) - - # 0. center input if necessary - if self.config.center_input_sample: - sample = 2 * sample - 1.0 - - # 1. time - timesteps = timestep - if not paddle.is_tensor(timesteps): - timesteps = paddle.to_tensor([timesteps], dtype="int64") - elif len(timesteps.shape) == 0: - timesteps = timesteps[None] - - # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps.expand( - [ - sample.shape[0], - ] - ) - - t_emb = self.time_proj(timesteps) - - # timesteps does not contain any weights and will always return f32 tensors - # but time_embedding might actually be running in fp16. so we need to cast here. - # there might be better ways to encapsulate this. - t_emb = t_emb.cast(self.dtype) - emb = self.time_embedding(t_emb) - - if self.class_embedding is not None: - if class_labels is None: - raise ValueError("class_labels should be provided when doing class conditioning") - - class_labels = class_labels.cast(self.dtype) - - if self.config.class_embed_type == "timestep": - class_labels = self.time_proj(class_labels) - - if isinstance(self.class_embedding, nn.Embedding): - class_labels = class_labels.cast(paddle.int64) - class_emb = self.class_embedding(class_labels).cast(self.dtype) - emb = emb + class_emb - - # 2. pre-process - skip_sample = sample - sample = self.conv_in(sample) - - # 3. down - down_block_res_samples = (sample,) - - if self.resnet_pre_temb_non_linearity: - emb = self.down_resnet_temb_nonlinearity(emb) - - for downsample_block in self.down_blocks: - if hasattr(downsample_block, "skip_conv"): - sample, res_samples, skip_sample = downsample_block( - hidden_states=sample, temb=emb, skip_sample=skip_sample - ) - else: - sample, res_samples = downsample_block(hidden_states=sample, temb=emb) - - down_block_res_samples += res_samples - - # 4. mid - sample = self.mid_block(sample, emb) - - # 5. 
up - skip_sample = None - for upsample_block in self.up_blocks: - res_samples = down_block_res_samples[-len(upsample_block.resnets) :] - down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] - - if hasattr(upsample_block, "skip_conv"): - sample, skip_sample = upsample_block(sample, res_samples, emb, skip_sample) - else: - sample = upsample_block(sample, res_samples, emb) - - # 6. post-process - sample = self.conv_norm_out(sample) - sample = self.conv_act(sample) - sample = self.conv_out(sample) - - if skip_sample is not None: - sample += skip_sample - - if self.config.time_embedding_type == "fourier": - timesteps = timesteps.reshape([sample.shape[0], *([1] * len(sample.shape[1:]))]) - sample = sample / timesteps - - if not return_dict: - return (sample,) - - return UNet2DOutput(sample=sample) diff --git a/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py b/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py deleted file mode 100644 index be7b08746d46..000000000000 --- a/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py +++ /dev/null @@ -1,2916 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import Optional - -import numpy as np -import paddle -from paddle import nn -from paddle.distributed.fleet.utils import recompute - -from .attention import AdaGroupNorm, AttentionBlock -from .attention_processor import Attention, AttnAddedKVProcessor -from .dual_transformer_2d import DualTransformer2DModel -from .resnet import ( - Downsample2D, - FirDownsample2D, - FirUpsample2D, - KDownsample2D, - KUpsample2D, - ResnetBlock2D, - Upsample2D, -) -from .transformer_2d import Transformer2DModel - - -def get_down_block( - down_block_type, - num_layers, - in_channels, - out_channels, - temb_channels, - add_downsample, - resnet_eps, - resnet_act_fn, - attn_num_head_channels, - resnet_groups=None, - cross_attention_dim=None, - downsample_padding=None, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, - resnet_time_scale_shift="default", - resnet_skip_time_act=False, - resnet_out_scale_factor=1.0, - cross_attention_norm=None, - resnet_pre_temb_non_linearity=False, -): - down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type - if down_block_type == "DownBlock2D": - return DownBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - add_downsample=add_downsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - downsample_padding=downsample_padding, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif down_block_type == "ResnetDownsampleBlock2D": - return ResnetDownsampleBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - 
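The removed UNet2DModel above takes a noisy sample and a timestep and returns the predicted sample. A minimal calling sketch, assuming the import path implied by the deleted file location (ppdiffusers/models/unet_2d.py) and purely illustrative shapes:

    # Illustrative call of the deleted UNet2DModel (shapes are placeholders).
    import paddle
    from ppdiffusers.models.unet_2d import UNet2DModel

    model = UNet2DModel(sample_size=32, in_channels=3, out_channels=3)
    noisy = paddle.randn([1, 3, 32, 32])   # (batch, channel, height, width)
    out = model(noisy, timestep=10, return_dict=True)
    pred = out.sample                      # paddle.Tensor, same shape as the input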
temb_channels=temb_channels, - add_downsample=add_downsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - resnet_time_scale_shift=resnet_time_scale_shift, - skip_time_act=resnet_skip_time_act, - output_scale_factor=resnet_out_scale_factor, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif down_block_type == "AttnDownBlock2D": - return AttnDownBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - add_downsample=add_downsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - downsample_padding=downsample_padding, - attn_num_head_channels=attn_num_head_channels, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif down_block_type == "CrossAttnDownBlock2D": - if cross_attention_dim is None: - raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D") - return CrossAttnDownBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - add_downsample=add_downsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - downsample_padding=downsample_padding, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, - dual_cross_attention=dual_cross_attention, - use_linear_projection=use_linear_projection, - only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif down_block_type == "SimpleCrossAttnDownBlock2D": - if cross_attention_dim is None: - raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnDownBlock2D") - return SimpleCrossAttnDownBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - add_downsample=add_downsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, - resnet_time_scale_shift=resnet_time_scale_shift, - skip_time_act=resnet_skip_time_act, - output_scale_factor=resnet_out_scale_factor, - only_cross_attention=only_cross_attention, - cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif down_block_type == "SkipDownBlock2D": - return SkipDownBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - add_downsample=add_downsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - downsample_padding=downsample_padding, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif down_block_type == "AttnSkipDownBlock2D": - return AttnSkipDownBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - add_downsample=add_downsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - downsample_padding=downsample_padding, - attn_num_head_channels=attn_num_head_channels, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif down_block_type == "DownEncoderBlock2D": - return DownEncoderBlock2D( - 
num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - add_downsample=add_downsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - downsample_padding=downsample_padding, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif down_block_type == "AttnDownEncoderBlock2D": - return AttnDownEncoderBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - add_downsample=add_downsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - downsample_padding=downsample_padding, - attn_num_head_channels=attn_num_head_channels, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif down_block_type == "KDownBlock2D": - return KDownBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - add_downsample=add_downsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif down_block_type == "KCrossAttnDownBlock2D": - return KCrossAttnDownBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - add_downsample=add_downsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, - add_self_attention=True if not add_downsample else False, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - raise ValueError(f"{down_block_type} does not exist.") - - -def get_up_block( - up_block_type, - num_layers, - in_channels, - out_channels, - prev_output_channel, - temb_channels, - add_upsample, - resnet_eps, - resnet_act_fn, - attn_num_head_channels, - resnet_groups=None, - cross_attention_dim=None, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, - resnet_time_scale_shift="default", - resnet_skip_time_act=False, - resnet_out_scale_factor=1.0, - cross_attention_norm=None, - resnet_pre_temb_non_linearity=False, -): - up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type - if up_block_type == "UpBlock2D": - return UpBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - prev_output_channel=prev_output_channel, - temb_channels=temb_channels, - add_upsample=add_upsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif up_block_type == "ResnetUpsampleBlock2D": - return ResnetUpsampleBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - prev_output_channel=prev_output_channel, - temb_channels=temb_channels, - add_upsample=add_upsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - resnet_time_scale_shift=resnet_time_scale_shift, - skip_time_act=resnet_skip_time_act, - output_scale_factor=resnet_out_scale_factor, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif up_block_type == "CrossAttnUpBlock2D": - if cross_attention_dim is None: - raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D") - return 
CrossAttnUpBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - prev_output_channel=prev_output_channel, - temb_channels=temb_channels, - add_upsample=add_upsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, - dual_cross_attention=dual_cross_attention, - use_linear_projection=use_linear_projection, - only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif up_block_type == "SimpleCrossAttnUpBlock2D": - if cross_attention_dim is None: - raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnUpBlock2D") - return SimpleCrossAttnUpBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - prev_output_channel=prev_output_channel, - temb_channels=temb_channels, - add_upsample=add_upsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, - resnet_time_scale_shift=resnet_time_scale_shift, - skip_time_act=resnet_skip_time_act, - output_scale_factor=resnet_out_scale_factor, - only_cross_attention=only_cross_attention, - cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif up_block_type == "AttnUpBlock2D": - return AttnUpBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - prev_output_channel=prev_output_channel, - temb_channels=temb_channels, - add_upsample=add_upsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - attn_num_head_channels=attn_num_head_channels, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif up_block_type == "SkipUpBlock2D": - return SkipUpBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - prev_output_channel=prev_output_channel, - temb_channels=temb_channels, - add_upsample=add_upsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif up_block_type == "AttnSkipUpBlock2D": - return AttnSkipUpBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - prev_output_channel=prev_output_channel, - temb_channels=temb_channels, - add_upsample=add_upsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - attn_num_head_channels=attn_num_head_channels, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif up_block_type == "UpDecoderBlock2D": - return UpDecoderBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - add_upsample=add_upsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif up_block_type == "AttnUpDecoderBlock2D": - return AttnUpDecoderBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - add_upsample=add_upsample, - resnet_eps=resnet_eps, - 
resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - attn_num_head_channels=attn_num_head_channels, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif up_block_type == "KUpBlock2D": - return KUpBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - add_upsample=add_upsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif up_block_type == "KCrossAttnUpBlock2D": - return KCrossAttnUpBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - add_upsample=add_upsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - - raise ValueError(f"{up_block_type} does not exist.") - - -class UNetMidBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - add_attention: bool = True, - attn_num_head_channels: int = 1, - output_scale_factor: float = 1.0, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) - self.add_attention = add_attention - - # there is always at least one resnet - resnets = [ - ResnetBlock2D( - in_channels=in_channels, - out_channels=in_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ] - attentions = [] - - for _ in range(num_layers): - if self.add_attention: - attentions.append( - AttentionBlock( - in_channels, - num_head_channels=attn_num_head_channels, - rescale_output_factor=output_scale_factor, - eps=resnet_eps, - norm_num_groups=resnet_groups, - ) - ) - else: - attentions.append(None) - - resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=in_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - - self.attentions = nn.LayerList(attentions) - self.resnets = nn.LayerList(resnets) - - def forward(self, hidden_states, temb=None): - hidden_states = self.resnets[0](hidden_states, temb) - for attn, resnet in zip(self.attentions, self.resnets[1:]): - if attn is not None: - hidden_states = attn(hidden_states) - hidden_states = resnet(hidden_states, temb) - - return hidden_states - - -class UNetMidBlock2DCrossAttn(nn.Layer): - def __init__( - self, - in_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - attn_num_head_channels: int = 1, - 
output_scale_factor: float = 1.0, - cross_attention_dim: int = 1280, - dual_cross_attention: bool = False, - use_linear_projection: bool = False, - upcast_attention: bool = False, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - - self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels - resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) - - # there is always at least one resnet - resnets = [ - ResnetBlock2D( - in_channels=in_channels, - out_channels=in_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ] - attentions = [] - - for _ in range(num_layers): - if not dual_cross_attention: - attentions.append( - Transformer2DModel( - attn_num_head_channels, - in_channels // attn_num_head_channels, - in_channels=in_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - use_linear_projection=use_linear_projection, - upcast_attention=upcast_attention, - ) - ) - else: - attentions.append( - DualTransformer2DModel( - attn_num_head_channels, - in_channels // attn_num_head_channels, - in_channels=in_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - ) - ) - resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=in_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - - self.attentions = nn.LayerList(attentions) - self.resnets = nn.LayerList(resnets) - - def forward( - self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None - ): - hidden_states = self.resnets[0](hidden_states, temb) - for attn, resnet in zip(self.attentions, self.resnets[1:]): - hidden_states = attn( - hidden_states, - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - hidden_states = resnet(hidden_states, temb) - - return hidden_states - - -class UNetMidBlock2DSimpleCrossAttn(nn.Layer): - def __init__( - self, - in_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - attn_num_head_channels: int = 1, - output_scale_factor: float = 1.0, - cross_attention_dim: int = 1280, - skip_time_act=False, - only_cross_attention=False, - cross_attention_norm=None, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - - self.has_cross_attention = True - - self.attn_num_head_channels = attn_num_head_channels - resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) - - self.num_heads = in_channels // self.attn_num_head_channels - - # there is always at least one resnet - resnets = [ - ResnetBlock2D( - in_channels=in_channels, - out_channels=in_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - 
time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - skip_time_act=skip_time_act, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ] - attentions = [] - - for _ in range(num_layers): - # TODO use AttnAddedKVProcessor2_5 - # processor = ( - # AttnAddedKVProcessor2_5() if hasattr(F, "scaled_dot_product_attention_") else AttnAddedKVProcessor() - # ) - processor = AttnAddedKVProcessor() - attentions.append( - Attention( - query_dim=in_channels, - cross_attention_dim=in_channels, - heads=self.num_heads, - dim_head=attn_num_head_channels, - added_kv_proj_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - bias=True, - upcast_softmax=True, - only_cross_attention=only_cross_attention, - cross_attention_norm=cross_attention_norm, - processor=processor, - ) - ) - resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=in_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - skip_time_act=skip_time_act, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - - self.attentions = nn.LayerList(attentions) - self.resnets = nn.LayerList(resnets) - - def forward( - self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None - ): - cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} - hidden_states = self.resnets[0](hidden_states, temb) - for attn, resnet in zip(self.attentions, self.resnets[1:]): - # attn - hidden_states = attn( - hidden_states, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - **cross_attention_kwargs, - ) - - # resnet - hidden_states = resnet(hidden_states, temb) - - return hidden_states - - -class AttnDownBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - attn_num_head_channels: int = 1, - output_scale_factor: float = 1.0, - downsample_padding: int = 1, - add_downsample: bool = True, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - attentions = [] - - for i in range(num_layers): - in_channels = in_channels if i == 0 else out_channels - resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - attentions.append( - AttentionBlock( - out_channels, - num_head_channels=attn_num_head_channels, - rescale_output_factor=output_scale_factor, - eps=resnet_eps, - norm_num_groups=resnet_groups, - ) - ) - - self.attentions = nn.LayerList(attentions) - self.resnets = nn.LayerList(resnets) - - if add_downsample: - self.downsamplers = nn.LayerList( - [ - Downsample2D( - out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" - ) - ] - ) - else: - self.downsamplers = None - 
- def forward(self, hidden_states, temb=None): - output_states = () - - for resnet, attn in zip(self.resnets, self.attentions): - hidden_states = resnet(hidden_states, temb) - hidden_states = attn(hidden_states) - output_states += (hidden_states,) - - if self.downsamplers is not None: - for downsampler in self.downsamplers: - hidden_states = downsampler(hidden_states) - - output_states += (hidden_states,) - - return hidden_states, output_states - - -class CrossAttnDownBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - attn_num_head_channels: int = 1, - cross_attention_dim: int = 1280, - output_scale_factor: float = 1.0, - downsample_padding: int = 1, - add_downsample: bool = True, - dual_cross_attention: bool = False, - use_linear_projection: bool = False, - only_cross_attention: bool = False, - upcast_attention: bool = False, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - attentions = [] - - self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels - - for i in range(num_layers): - in_channels = in_channels if i == 0 else out_channels - resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - if not dual_cross_attention: - attentions.append( - Transformer2DModel( - attn_num_head_channels, - out_channels // attn_num_head_channels, - in_channels=out_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - use_linear_projection=use_linear_projection, - only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, - ) - ) - else: - attentions.append( - DualTransformer2DModel( - attn_num_head_channels, - out_channels // attn_num_head_channels, - in_channels=out_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - ) - ) - self.attentions = nn.LayerList(attentions) - self.resnets = nn.LayerList(resnets) - - if add_downsample: - self.downsamplers = nn.LayerList( - [ - Downsample2D( - out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" - ) - ] - ) - else: - self.downsamplers = None - - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - cross_attention_kwargs=None, - additional_residuals=None, - ): - # TODO(Patrick, William) - attention mask is not used - output_states = () - - for resnet, attn in zip(self.resnets, self.attentions): - if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: - - def create_custom_forward(module, return_dict=None): - def custom_forward(*inputs): - if return_dict is not None: - return module(*inputs, return_dict=return_dict)[0] # move [0] when paddlepaddle <= 2.4.1 - else: - return module(*inputs) - - return custom_forward - - hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) - hidden_states = 
recompute( - create_custom_forward(attn, return_dict=False), - hidden_states, - encoder_hidden_states, - cross_attention_kwargs, - ) # [0] - else: - hidden_states = resnet(hidden_states, temb) - hidden_states = attn( - hidden_states, - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - output_states += (hidden_states,) - - if additional_residuals is not None: - hidden_states += additional_residuals - - # westfish: add to align with torch features - output_states = tuple(output_states[:-1]) + (hidden_states,) - - if self.downsamplers is not None: - for downsampler in self.downsamplers: - hidden_states = downsampler(hidden_states) - - output_states += (hidden_states,) - - return hidden_states, output_states - - -class DownBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - output_scale_factor: float = 1.0, - add_downsample: bool = True, - downsample_padding: int = 1, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - - for i in range(num_layers): - in_channels = in_channels if i == 0 else out_channels - resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - - self.resnets = nn.LayerList(resnets) - - if add_downsample: - self.downsamplers = nn.LayerList( - [ - Downsample2D( - out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" - ) - ] - ) - else: - self.downsamplers = None - - self.gradient_checkpointing = False - - def forward(self, hidden_states, temb=None): - output_states = () - - for resnet in self.resnets: - if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) - else: - hidden_states = resnet(hidden_states, temb) - - output_states += (hidden_states,) - - if self.downsamplers is not None: - for downsampler in self.downsamplers: - hidden_states = downsampler(hidden_states) - - output_states += (hidden_states,) - - return hidden_states, output_states - - -class DownEncoderBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - output_scale_factor: float = 1.0, - add_downsample: bool = True, - downsample_padding: int = 1, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - - for i in range(num_layers): - in_channels = in_channels if i == 0 else out_channels - resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=out_channels, - temb_channels=None, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - 
time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - - self.resnets = nn.LayerList(resnets) - - if add_downsample: - self.downsamplers = nn.LayerList( - [ - Downsample2D( - out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" - ) - ] - ) - else: - self.downsamplers = None - - def forward(self, hidden_states): - for resnet in self.resnets: - hidden_states = resnet(hidden_states, temb=None) - - if self.downsamplers is not None: - for downsampler in self.downsamplers: - hidden_states = downsampler(hidden_states) - - return hidden_states - - -class AttnDownEncoderBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - attn_num_head_channels: int = 1, - output_scale_factor: float = 1.0, - add_downsample: bool = True, - downsample_padding: int = 1, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - attentions = [] - - for i in range(num_layers): - in_channels = in_channels if i == 0 else out_channels - resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=out_channels, - temb_channels=None, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - attentions.append( - AttentionBlock( - out_channels, - num_head_channels=attn_num_head_channels, - rescale_output_factor=output_scale_factor, - eps=resnet_eps, - norm_num_groups=resnet_groups, - ) - ) - - self.attentions = nn.LayerList(attentions) - self.resnets = nn.LayerList(resnets) - - if add_downsample: - self.downsamplers = nn.LayerList( - [ - Downsample2D( - out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" - ) - ] - ) - else: - self.downsamplers = None - - def forward(self, hidden_states): - for resnet, attn in zip(self.resnets, self.attentions): - hidden_states = resnet(hidden_states, temb=None) - hidden_states = attn(hidden_states) - - if self.downsamplers is not None: - for downsampler in self.downsamplers: - hidden_states = downsampler(hidden_states) - - return hidden_states - - -class AttnSkipDownBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_pre_norm: bool = True, - attn_num_head_channels: int = 1, - output_scale_factor: float = np.sqrt(2.0), - downsample_padding: int = 1, - add_downsample: bool = True, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - self.attentions = nn.LayerList([]) - self.resnets = nn.LayerList([]) - - for i in range(num_layers): - in_channels = in_channels if i == 0 else out_channels - self.resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=min(in_channels // 4, 32), - groups_out=min(out_channels // 4, 32), - dropout=dropout, - 
time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - self.attentions.append( - AttentionBlock( - out_channels, - num_head_channels=attn_num_head_channels, - rescale_output_factor=output_scale_factor, - eps=resnet_eps, - ) - ) - - if add_downsample: - self.resnet_down = ResnetBlock2D( - in_channels=out_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=min(out_channels // 4, 32), - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - use_in_shortcut=True, - down=True, - kernel="fir", - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - self.downsamplers = nn.LayerList([FirDownsample2D(out_channels, out_channels=out_channels)]) - self.skip_conv = nn.Conv2D(3, out_channels, kernel_size=(1, 1), stride=(1, 1)) - else: - self.resnet_down = None - self.downsamplers = None - self.skip_conv = None - - def forward(self, hidden_states, temb=None, skip_sample=None): - output_states = () - - for resnet, attn in zip(self.resnets, self.attentions): - hidden_states = resnet(hidden_states, temb) - hidden_states = attn(hidden_states) - output_states += (hidden_states,) - - if self.downsamplers is not None: - hidden_states = self.resnet_down(hidden_states, temb) - for downsampler in self.downsamplers: - skip_sample = downsampler(skip_sample) - - hidden_states = self.skip_conv(skip_sample) + hidden_states - - output_states += (hidden_states,) - - return hidden_states, output_states, skip_sample - - -class SkipDownBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_pre_norm: bool = True, - output_scale_factor: float = np.sqrt(2.0), - add_downsample: bool = True, - downsample_padding: int = 1, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - self.resnets = nn.LayerList([]) - - for i in range(num_layers): - in_channels = in_channels if i == 0 else out_channels - self.resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=min(in_channels // 4, 32), - groups_out=min(out_channels // 4, 32), - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - - if add_downsample: - self.resnet_down = ResnetBlock2D( - in_channels=out_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=min(out_channels // 4, 32), - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - use_in_shortcut=True, - down=True, - kernel="fir", - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - self.downsamplers = nn.LayerList([FirDownsample2D(out_channels, out_channels=out_channels)]) - self.skip_conv = nn.Conv2D(3, out_channels, kernel_size=(1, 1), stride=(1, 1)) - else: - self.resnet_down = None - self.downsamplers = None - self.skip_conv = None - - def 
forward(self, hidden_states, temb=None, skip_sample=None): - output_states = () - - for resnet in self.resnets: - hidden_states = resnet(hidden_states, temb) - output_states += (hidden_states,) - - if self.downsamplers is not None: - hidden_states = self.resnet_down(hidden_states, temb) - for downsampler in self.downsamplers: - skip_sample = downsampler(skip_sample) - - hidden_states = self.skip_conv(skip_sample) + hidden_states - - output_states += (hidden_states,) - - return hidden_states, output_states, skip_sample - - -class ResnetDownsampleBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - output_scale_factor: float = 1.0, - add_downsample: bool = True, - skip_time_act: bool = False, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - - for i in range(num_layers): - in_channels = in_channels if i == 0 else out_channels - resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - skip_time_act=skip_time_act, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - - self.resnets = nn.LayerList(resnets) - - if add_downsample: - self.downsamplers = nn.LayerList( - [ - ResnetBlock2D( - in_channels=out_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - skip_time_act=skip_time_act, - down=True, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ] - ) - else: - self.downsamplers = None - - self.gradient_checkpointing = False - - def forward(self, hidden_states, temb=None): - output_states = () - - for resnet in self.resnets: - if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) - else: - hidden_states = resnet(hidden_states, temb) - - output_states += (hidden_states,) - - if self.downsamplers is not None: - for downsampler in self.downsamplers: - hidden_states = downsampler(hidden_states, temb) - - output_states += (hidden_states,) - - return hidden_states, output_states - - -class SimpleCrossAttnDownBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - attn_num_head_channels: int = 1, - cross_attention_dim: int = 1280, - output_scale_factor: float = 1.0, - add_downsample: bool = True, - skip_time_act=False, - only_cross_attention=False, - cross_attention_norm=None, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - - self.has_cross_attention = True - - resnets = 
[] - attentions = [] - - self.attn_num_head_channels = attn_num_head_channels - self.num_heads = out_channels // self.attn_num_head_channels - - for i in range(num_layers): - in_channels = in_channels if i == 0 else out_channels - resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - skip_time_act=skip_time_act, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - # TODO use AttnAddedKVProcessor2_5 - # processor = ( - # AttnAddedKVProcessor2_5() if hasattr(F, "scaled_dot_product_attention_") else AttnAddedKVProcessor() - # ) - processor = AttnAddedKVProcessor() - attentions.append( - Attention( - query_dim=out_channels, - cross_attention_dim=out_channels, - heads=self.num_heads, - dim_head=attn_num_head_channels, - added_kv_proj_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - bias=True, - upcast_softmax=True, - only_cross_attention=only_cross_attention, - cross_attention_norm=cross_attention_norm, - processor=processor, - ) - ) - self.attentions = nn.LayerList(attentions) - self.resnets = nn.LayerList(resnets) - - if add_downsample: - self.downsamplers = nn.LayerList( - [ - ResnetBlock2D( - in_channels=out_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - skip_time_act=skip_time_act, - down=True, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ] - ) - else: - self.downsamplers = None - - self.gradient_checkpointing = False - - def forward( - self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None - ): - output_states = () - cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} - - for resnet, attn in zip(self.resnets, self.attentions): - # resnet - hidden_states = resnet(hidden_states, temb) - - # attn - hidden_states = attn( - hidden_states, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - **cross_attention_kwargs, - ) - - output_states += (hidden_states,) - - if self.downsamplers is not None: - for downsampler in self.downsamplers: - hidden_states = downsampler(hidden_states, temb) - - output_states += (hidden_states,) - - return hidden_states, output_states - - -class KDownBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 4, - resnet_eps: float = 1e-5, - resnet_act_fn: str = "gelu", - resnet_group_size: int = 32, - add_downsample: bool = False, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - - for i in range(num_layers): - in_channels = in_channels if i == 0 else out_channels - groups = in_channels // resnet_group_size - groups_out = out_channels // resnet_group_size - - resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=out_channels, - dropout=dropout, - temb_channels=temb_channels, - groups=groups, - groups_out=groups_out, - eps=resnet_eps, - non_linearity=resnet_act_fn, - time_embedding_norm="ada_group", - conv_shortcut_bias=False, - 
pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - - self.resnets = nn.LayerList(resnets) - - if add_downsample: - # YiYi's comments- might be able to use FirDownsample2D, look into details later - self.downsamplers = nn.LayerList([KDownsample2D()]) - else: - self.downsamplers = None - - self.gradient_checkpointing = False - - def forward(self, hidden_states, temb=None): - output_states = () - - for resnet in self.resnets: - if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) - else: - hidden_states = resnet(hidden_states, temb) - - output_states += (hidden_states,) - - if self.downsamplers is not None: - for downsampler in self.downsamplers: - hidden_states = downsampler(hidden_states) - - return hidden_states, output_states - - -class KCrossAttnDownBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - cross_attention_dim: int, - dropout: float = 0.0, - num_layers: int = 4, - resnet_group_size: int = 32, - add_downsample=True, - attn_num_head_channels: int = 64, - add_self_attention: bool = False, - resnet_eps: float = 1e-5, - resnet_act_fn: str = "gelu", - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - attentions = [] - - self.has_cross_attention = True - - for i in range(num_layers): - in_channels = in_channels if i == 0 else out_channels - groups = in_channels // resnet_group_size - groups_out = out_channels // resnet_group_size - - resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=out_channels, - dropout=dropout, - temb_channels=temb_channels, - groups=groups, - groups_out=groups_out, - eps=resnet_eps, - non_linearity=resnet_act_fn, - time_embedding_norm="ada_group", - conv_shortcut_bias=False, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - attentions.append( - KAttentionBlock( - out_channels, - out_channels // attn_num_head_channels, - attn_num_head_channels, - cross_attention_dim=cross_attention_dim, - temb_channels=temb_channels, - attention_bias=True, - add_self_attention=add_self_attention, - cross_attention_norm="layer_norm", - group_size=resnet_group_size, - ) - ) - - self.resnets = nn.LayerList(resnets) - self.attentions = nn.LayerList(attentions) - - if add_downsample: - self.downsamplers = nn.LayerList([KDownsample2D()]) - else: - self.downsamplers = None - - self.gradient_checkpointing = False - - def forward( - self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None - ): - output_states = () - - for resnet, attn in zip(self.resnets, self.attentions): - if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: - - def create_custom_forward(module, return_dict=None): - def custom_forward(*inputs): - if return_dict is not None: - return module(*inputs, return_dict=return_dict) - else: - return module(*inputs) - - return custom_forward - - hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) - hidden_states = recompute( - create_custom_forward(attn, return_dict=False), - hidden_states, - encoder_hidden_states, - attention_mask, - cross_attention_kwargs, - ) - else: - hidden_states = resnet(hidden_states, temb) - hidden_states = attn( - hidden_states, - 
encoder_hidden_states=encoder_hidden_states, - emb=temb, - attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, - ) - - if self.downsamplers is None: - output_states += (None,) - else: - output_states += (hidden_states,) - - if self.downsamplers is not None: - for downsampler in self.downsamplers: - hidden_states = downsampler(hidden_states) - - return hidden_states, output_states - - -class AttnUpBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - attn_num_head_channels: int = 1, - output_scale_factor: float = 1.0, - add_upsample: bool = True, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - attentions = [] - - for i in range(num_layers): - res_skip_channels = in_channels if (i == num_layers - 1) else out_channels - resnet_in_channels = prev_output_channel if i == 0 else out_channels - - resnets.append( - ResnetBlock2D( - in_channels=resnet_in_channels + res_skip_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - attentions.append( - AttentionBlock( - out_channels, - num_head_channels=attn_num_head_channels, - rescale_output_factor=output_scale_factor, - eps=resnet_eps, - norm_num_groups=resnet_groups, - ) - ) - - self.attentions = nn.LayerList(attentions) - self.resnets = nn.LayerList(resnets) - - if add_upsample: - self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) - else: - self.upsamplers = None - - def forward(self, hidden_states, res_hidden_states_tuple, temb=None): - for resnet, attn in zip(self.resnets, self.attentions): - # pop res hidden states - res_hidden_states = res_hidden_states_tuple[-1] - res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) - - hidden_states = resnet(hidden_states, temb) - hidden_states = attn(hidden_states) - - if self.upsamplers is not None: - for upsampler in self.upsamplers: - hidden_states = upsampler(hidden_states) - - return hidden_states - - -class CrossAttnUpBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - prev_output_channel: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - attn_num_head_channels: int = 1, - cross_attention_dim: int = 1280, - output_scale_factor: float = 1.0, - add_upsample: bool = True, - dual_cross_attention: bool = False, - use_linear_projection: bool = False, - only_cross_attention: bool = False, - upcast_attention: bool = False, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - attentions = [] - - self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels - - for i in range(num_layers): - res_skip_channels = in_channels if (i == num_layers - 1) else out_channels - 
resnet_in_channels = prev_output_channel if i == 0 else out_channels - - resnets.append( - ResnetBlock2D( - in_channels=resnet_in_channels + res_skip_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - if not dual_cross_attention: - attentions.append( - Transformer2DModel( - attn_num_head_channels, - out_channels // attn_num_head_channels, - in_channels=out_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - use_linear_projection=use_linear_projection, - only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, - ) - ) - else: - attentions.append( - DualTransformer2DModel( - attn_num_head_channels, - out_channels // attn_num_head_channels, - in_channels=out_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - ) - ) - self.attentions = nn.LayerList(attentions) - self.resnets = nn.LayerList(resnets) - - if add_upsample: - self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) - else: - self.upsamplers = None - - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - res_hidden_states_tuple, - temb=None, - encoder_hidden_states=None, - cross_attention_kwargs=None, - upsample_size=None, - attention_mask=None, - ): - # TODO(Patrick, William) - attention mask is not used - for resnet, attn in zip(self.resnets, self.attentions): - # pop res hidden states - res_hidden_states = res_hidden_states_tuple[-1] - res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) - - if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: - - def create_custom_forward(module, return_dict=None): - def custom_forward(*inputs): - if return_dict is not None: - return module(*inputs, return_dict=return_dict)[0] # move [0] - else: - return module(*inputs) - - return custom_forward - - hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) - hidden_states = recompute( - create_custom_forward(attn, return_dict=False), - hidden_states, - encoder_hidden_states, - cross_attention_kwargs, - ) # [0] - else: - hidden_states = resnet(hidden_states, temb) - hidden_states = attn( - hidden_states, - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - if self.upsamplers is not None: - for upsampler in self.upsamplers: - hidden_states = upsampler(hidden_states, upsample_size) - - return hidden_states - - -class UpBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - output_scale_factor: float = 1.0, - add_upsample: bool = True, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - - for i in range(num_layers): - res_skip_channels = in_channels if (i == num_layers - 1) else out_channels - resnet_in_channels = prev_output_channel if i == 0 else 
out_channels - - resnets.append( - ResnetBlock2D( - in_channels=resnet_in_channels + res_skip_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - - self.resnets = nn.LayerList(resnets) - - if add_upsample: - self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) - else: - self.upsamplers = None - - self.gradient_checkpointing = False - - def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None): - for resnet in self.resnets: - # pop res hidden states - res_hidden_states = res_hidden_states_tuple[-1] - res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) - - if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) - else: - hidden_states = resnet(hidden_states, temb) - - if self.upsamplers is not None: - for upsampler in self.upsamplers: - hidden_states = upsampler(hidden_states, upsample_size) - - return hidden_states - - -class UpDecoderBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - output_scale_factor: float = 1.0, - add_upsample: bool = True, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - - for i in range(num_layers): - input_channels = in_channels if i == 0 else out_channels - - resnets.append( - ResnetBlock2D( - in_channels=input_channels, - out_channels=out_channels, - temb_channels=None, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - - self.resnets = nn.LayerList(resnets) - - if add_upsample: - self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) - else: - self.upsamplers = None - - def forward(self, hidden_states): - for resnet in self.resnets: - hidden_states = resnet(hidden_states, temb=None) - - if self.upsamplers is not None: - for upsampler in self.upsamplers: - hidden_states = upsampler(hidden_states) - - return hidden_states - - -class AttnUpDecoderBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - attn_num_head_channels: int = 1, - output_scale_factor: float = 1.0, - add_upsample: bool = True, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - attentions = [] - - for i in range(num_layers): - input_channels = in_channels if i == 0 else out_channels - - resnets.append( - 
ResnetBlock2D( - in_channels=input_channels, - out_channels=out_channels, - temb_channels=None, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - attentions.append( - AttentionBlock( - out_channels, - num_head_channels=attn_num_head_channels, - rescale_output_factor=output_scale_factor, - eps=resnet_eps, - norm_num_groups=resnet_groups, - ) - ) - - self.attentions = nn.LayerList(attentions) - self.resnets = nn.LayerList(resnets) - - if add_upsample: - self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) - else: - self.upsamplers = None - - def forward(self, hidden_states): - for resnet, attn in zip(self.resnets, self.attentions): - hidden_states = resnet(hidden_states, temb=None) - hidden_states = attn(hidden_states) - - if self.upsamplers is not None: - for upsampler in self.upsamplers: - hidden_states = upsampler(hidden_states) - - return hidden_states - - -class AttnSkipUpBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_pre_norm: bool = True, - attn_num_head_channels: int = 1, - output_scale_factor: float = np.sqrt(2.0), - upsample_padding: int = 1, - add_upsample: bool = True, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - self.attentions = nn.LayerList([]) - self.resnets = nn.LayerList([]) - - for i in range(num_layers): - res_skip_channels = in_channels if (i == num_layers - 1) else out_channels - resnet_in_channels = prev_output_channel if i == 0 else out_channels - - self.resnets.append( - ResnetBlock2D( - in_channels=resnet_in_channels + res_skip_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=min(resnet_in_channels + res_skip_channels // 4, 32), - groups_out=min(out_channels // 4, 32), - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - - self.attentions.append( - AttentionBlock( - out_channels, - num_head_channels=attn_num_head_channels, - rescale_output_factor=output_scale_factor, - eps=resnet_eps, - ) - ) - - self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels) - if add_upsample: - self.resnet_up = ResnetBlock2D( - in_channels=out_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=min(out_channels // 4, 32), - groups_out=min(out_channels // 4, 32), - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - use_in_shortcut=True, - up=True, - kernel="fir", - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - self.skip_conv = nn.Conv2D(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) - self.skip_norm = nn.GroupNorm( - num_groups=min(out_channels // 4, 32), num_channels=out_channels, epsilon=resnet_eps - ) - self.act = nn.Silu() - else: - self.resnet_up = None - self.skip_conv = None - self.skip_norm = None - self.act 
= None - - def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None): - for resnet in self.resnets: - # pop res hidden states - res_hidden_states = res_hidden_states_tuple[-1] - res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) - - hidden_states = resnet(hidden_states, temb) - - hidden_states = self.attentions[0](hidden_states) - - if skip_sample is not None: - skip_sample = self.upsampler(skip_sample) - else: - skip_sample = 0 - - if self.resnet_up is not None: - skip_sample_states = self.skip_norm(hidden_states) - skip_sample_states = self.act(skip_sample_states) - skip_sample_states = self.skip_conv(skip_sample_states) - - skip_sample = skip_sample + skip_sample_states - - hidden_states = self.resnet_up(hidden_states, temb) - - return hidden_states, skip_sample - - -class SkipUpBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_pre_norm: bool = True, - output_scale_factor: float = np.sqrt(2.0), - add_upsample: bool = True, - upsample_padding: int = 1, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - self.resnets = nn.LayerList([]) - - for i in range(num_layers): - res_skip_channels = in_channels if (i == num_layers - 1) else out_channels - resnet_in_channels = prev_output_channel if i == 0 else out_channels - - self.resnets.append( - ResnetBlock2D( - in_channels=resnet_in_channels + res_skip_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=min((resnet_in_channels + res_skip_channels) // 4, 32), - groups_out=min(out_channels // 4, 32), - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - - self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels) - if add_upsample: - self.resnet_up = ResnetBlock2D( - in_channels=out_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=min(out_channels // 4, 32), - groups_out=min(out_channels // 4, 32), - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - use_in_shortcut=True, - up=True, - kernel="fir", - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - self.skip_conv = nn.Conv2D(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) - self.skip_norm = nn.GroupNorm( - num_groups=min(out_channels // 4, 32), num_channels=out_channels, epsilon=resnet_eps - ) - self.act = nn.Silu() - else: - self.resnet_up = None - self.skip_conv = None - self.skip_norm = None - self.act = None - - def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None): - for resnet in self.resnets: - # pop res hidden states - res_hidden_states = res_hidden_states_tuple[-1] - res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) - - hidden_states = resnet(hidden_states, temb) - - if skip_sample is not None: - skip_sample = self.upsampler(skip_sample) - else: - skip_sample = 0 - - if 
self.resnet_up is not None: - skip_sample_states = self.skip_norm(hidden_states) - skip_sample_states = self.act(skip_sample_states) - skip_sample_states = self.skip_conv(skip_sample_states) - - skip_sample = skip_sample + skip_sample_states - - hidden_states = self.resnet_up(hidden_states, temb) - - return hidden_states, skip_sample - - -class ResnetUpsampleBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - output_scale_factor: float = 1.0, - add_upsample: bool = True, - skip_time_act=False, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - - for i in range(num_layers): - res_skip_channels = in_channels if (i == num_layers - 1) else out_channels - resnet_in_channels = prev_output_channel if i == 0 else out_channels - - resnets.append( - ResnetBlock2D( - in_channels=resnet_in_channels + res_skip_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - skip_time_act=skip_time_act, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - - self.resnets = nn.LayerList(resnets) - - if add_upsample: - self.upsamplers = nn.LayerList( - [ - ResnetBlock2D( - in_channels=out_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - skip_time_act=skip_time_act, - up=True, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ] - ) - else: - self.upsamplers = None - - self.gradient_checkpointing = False - - def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None): - for resnet in self.resnets: - # pop res hidden states - res_hidden_states = res_hidden_states_tuple[-1] - res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) - - if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) - else: - hidden_states = resnet(hidden_states, temb) - - if self.upsamplers is not None: - for upsampler in self.upsamplers: - hidden_states = upsampler(hidden_states, temb) - - return hidden_states - - -class SimpleCrossAttnUpBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - prev_output_channel: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - attn_num_head_channels: int = 1, - cross_attention_dim: int = 1280, - output_scale_factor: float = 1.0, - add_upsample: bool = True, - skip_time_act=False, - only_cross_attention=False, - cross_attention_norm=None, - 
resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - attentions = [] - - self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels - - self.num_heads = out_channels // self.attn_num_head_channels - - for i in range(num_layers): - res_skip_channels = in_channels if (i == num_layers - 1) else out_channels - resnet_in_channels = prev_output_channel if i == 0 else out_channels - - resnets.append( - ResnetBlock2D( - in_channels=resnet_in_channels + res_skip_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - skip_time_act=skip_time_act, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - # TODO support AttnAddedKVProcessor2_5 - # processor = ( - # AttnAddedKVProcessor2_5() if hasattr(F, "scaled_dot_product_attention_") else AttnAddedKVProcessor() - # ) - processor = AttnAddedKVProcessor() - attentions.append( - Attention( - query_dim=out_channels, - cross_attention_dim=out_channels, - heads=self.num_heads, - dim_head=attn_num_head_channels, - added_kv_proj_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - bias=True, - upcast_softmax=True, - only_cross_attention=only_cross_attention, - cross_attention_norm=cross_attention_norm, - processor=processor, - ) - ) - self.attentions = nn.LayerList(attentions) - self.resnets = nn.LayerList(resnets) - - if add_upsample: - self.upsamplers = nn.LayerList( - [ - ResnetBlock2D( - in_channels=out_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - skip_time_act=skip_time_act, - up=True, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ] - ) - else: - self.upsamplers = None - - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - res_hidden_states_tuple, - temb=None, - encoder_hidden_states=None, - upsample_size=None, - attention_mask=None, - cross_attention_kwargs=None, - ): - cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} - for resnet, attn in zip(self.resnets, self.attentions): - # resnet - # pop res hidden states - res_hidden_states = res_hidden_states_tuple[-1] - res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) - - hidden_states = resnet(hidden_states, temb) - - # attn - hidden_states = attn( - hidden_states, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - **cross_attention_kwargs, - ) - - if self.upsamplers is not None: - for upsampler in self.upsamplers: - hidden_states = upsampler(hidden_states, temb) - - return hidden_states - - -class KUpBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 5, - resnet_eps: float = 1e-5, - resnet_act_fn: str = "gelu", - resnet_group_size: Optional[int] = 32, - add_upsample: bool = True, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - k_in_channels = 2 * out_channels - k_out_channels = in_channels - num_layers = num_layers - 1 - - for i in 
range(num_layers): - in_channels = k_in_channels if i == 0 else out_channels - groups = in_channels // resnet_group_size - groups_out = out_channels // resnet_group_size - - resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=k_out_channels if (i == num_layers - 1) else out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=groups, - groups_out=groups_out, - dropout=dropout, - non_linearity=resnet_act_fn, - time_embedding_norm="ada_group", - conv_shortcut_bias=False, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - - self.resnets = nn.LayerList(resnets) - - if add_upsample: - self.upsamplers = nn.LayerList([KUpsample2D()]) - else: - self.upsamplers = None - - self.gradient_checkpointing = False - - def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None): - res_hidden_states_tuple = res_hidden_states_tuple[-1] - if res_hidden_states_tuple is not None: - hidden_states = paddle.concat([hidden_states, res_hidden_states_tuple], axis=1) - - for resnet in self.resnets: - if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) - else: - hidden_states = resnet(hidden_states, temb) - - if self.upsamplers is not None: - for upsampler in self.upsamplers: - hidden_states = upsampler(hidden_states) - - return hidden_states - - -class KCrossAttnUpBlock2D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 4, - resnet_eps: float = 1e-5, - resnet_act_fn: str = "gelu", - resnet_group_size: int = 32, - attn_num_head_channels=1, # attention dim_head - cross_attention_dim: int = 768, - add_upsample: bool = True, - upcast_attention: bool = False, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - attentions = [] - - is_first_block = in_channels == out_channels == temb_channels - is_middle_block = in_channels != out_channels - add_self_attention = True if is_first_block else False - - self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels - - # in_channels and out_channels for the block (k-unet) - k_in_channels = out_channels if is_first_block else 2 * out_channels - k_out_channels = in_channels - - num_layers = num_layers - 1 - - for i in range(num_layers): - in_channels = k_in_channels if i == 0 else out_channels - groups = in_channels // resnet_group_size - groups_out = out_channels // resnet_group_size - - if is_middle_block and (i == num_layers - 1): - conv_2d_out_channels = k_out_channels - else: - conv_2d_out_channels = None - - resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=out_channels, - conv_2d_out_channels=conv_2d_out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=groups, - groups_out=groups_out, - dropout=dropout, - non_linearity=resnet_act_fn, - time_embedding_norm="ada_group", - conv_shortcut_bias=False, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - attentions.append( - KAttentionBlock( - k_out_channels if (i == num_layers - 1) else out_channels, - k_out_channels // attn_num_head_channels - if (i == num_layers - 1) - else out_channels // attn_num_head_channels, - attn_num_head_channels, - cross_attention_dim=cross_attention_dim, - temb_channels=temb_channels, -
attention_bias=True, - add_self_attention=add_self_attention, - cross_attention_norm="layer_norm", - upcast_attention=upcast_attention, - ) - ) - - self.resnets = nn.LayerList(resnets) - self.attentions = nn.LayerList(attentions) - - if add_upsample: - self.upsamplers = nn.LayerList([KUpsample2D()]) - else: - self.upsamplers = None - - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - res_hidden_states_tuple, - temb=None, - encoder_hidden_states=None, - cross_attention_kwargs=None, - upsample_size=None, - attention_mask=None, - ): - res_hidden_states_tuple = res_hidden_states_tuple[-1] - if res_hidden_states_tuple is not None: - hidden_states = paddle.concat([hidden_states, res_hidden_states_tuple], axis=1) - - for resnet, attn in zip(self.resnets, self.attentions): - if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: - - def create_custom_forward(module, return_dict=None): - def custom_forward(*inputs): - if return_dict is not None: - return module(*inputs, return_dict=return_dict)[0] # move [0] - else: - return module(*inputs) - - return custom_forward - - hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) - hidden_states = recompute( - create_custom_forward(attn, return_dict=False), - hidden_states, - encoder_hidden_states, - attention_mask, - cross_attention_kwargs, - ) # [0] - else: - hidden_states = resnet(hidden_states, temb) - hidden_states = attn( - hidden_states, - encoder_hidden_states=encoder_hidden_states, - emb=temb, - attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, - ) - - if self.upsamplers is not None: - for upsampler in self.upsamplers: - hidden_states = upsampler(hidden_states) - - return hidden_states - - -# can potentially later be renamed to `No-feed-forward` attention -class KAttentionBlock(nn.Layer): - r""" - A basic Transformer block. - - Parameters: - dim (`int`): The number of channels in the input and output. - num_attention_heads (`int`): The number of heads to use for multi-head attention. - attention_head_dim (`int`): The number of channels in each head. - dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. - cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention. - activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. - num_embeds_ada_norm (: - obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`. - attention_bias (: - obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter. - """ - - def __init__( - self, - dim: int, - num_attention_heads: int, - attention_head_dim: int, - dropout: float = 0.0, - cross_attention_dim: Optional[int] = None, - attention_bias: bool = False, - upcast_attention: bool = False, - temb_channels: int = 768, # for ada_group_norm - add_self_attention: bool = False, - cross_attention_norm: Optional[str] = None, - group_size: int = 32, - ): - super().__init__() - self.add_self_attention = add_self_attention - - # 1. Self-Attn - if add_self_attention: - self.norm1 = AdaGroupNorm(temb_channels, dim, max(1, dim // group_size)) - self.attn1 = Attention( - query_dim=dim, - heads=num_attention_heads, - dim_head=attention_head_dim, - dropout=dropout, - bias=attention_bias, - cross_attention_dim=None, - cross_attention_norm=None, - ) - - # 2. 
Cross-Attn - self.norm2 = AdaGroupNorm(temb_channels, dim, max(1, dim // group_size)) - self.attn2 = Attention( - query_dim=dim, - cross_attention_dim=cross_attention_dim, - heads=num_attention_heads, - dim_head=attention_head_dim, - dropout=dropout, - bias=attention_bias, - upcast_attention=upcast_attention, - cross_attention_norm=cross_attention_norm, - ) - - def _to_3d(self, hidden_states, height, weight): - return hidden_states.transpose([0, 2, 3, 1]).reshape([hidden_states.shape[0], height * weight, -1]) - - def _to_4d(self, hidden_states, height, weight): - return hidden_states.transpose([0, 2, 1]).reshape([hidden_states.shape[0], -1, height, weight]) - - def forward( - self, - hidden_states, - encoder_hidden_states=None, - emb=None, - attention_mask=None, - cross_attention_kwargs=None, - ): - cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} - - # 1. Self-Attention - if self.add_self_attention: - norm_hidden_states = self.norm1(hidden_states, emb) - - height, weight = norm_hidden_states.shape[2:] - norm_hidden_states = self._to_3d(norm_hidden_states, height, weight) - - attn_output = self.attn1( - norm_hidden_states, - encoder_hidden_states=None, - **cross_attention_kwargs, - ) - attn_output = self._to_4d(attn_output, height, weight) - - hidden_states = attn_output + hidden_states - - # 2. Cross-Attention/None - norm_hidden_states = self.norm2(hidden_states, emb) - - height, weight = norm_hidden_states.shape[2:] - norm_hidden_states = self._to_3d(norm_hidden_states, height, weight) - attn_output = self.attn2( - norm_hidden_states, - encoder_hidden_states=encoder_hidden_states, - **cross_attention_kwargs, - ) - attn_output = self._to_4d(attn_output, height, weight) - - hidden_states = attn_output + hidden_states - - return hidden_states diff --git a/ppdiffusers/ppdiffusers/models/unet_2d_condition.py b/ppdiffusers/ppdiffusers/models/unet_2d_condition.py deleted file mode 100644 index a14ed38faf2f..000000000000 --- a/ppdiffusers/ppdiffusers/models/unet_2d_condition.py +++ /dev/null @@ -1,836 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Union - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ..configuration_utils import ConfigMixin, register_to_config -from ..loaders import UNet2DConditionLoadersMixin -from ..utils import NEG_INF, BaseOutput, logging -from .attention_processor import AttentionProcessor, AttnProcessor -from .embeddings import ( - GaussianFourierProjection, - TextTimeEmbedding, - TimestepEmbedding, - Timesteps, -) -from .modeling_utils import ModelMixin -from .unet_2d_blocks import ( - CrossAttnDownBlock2D, - CrossAttnUpBlock2D, - DownBlock2D, - UNetMidBlock2DCrossAttn, - UNetMidBlock2DSimpleCrossAttn, - UpBlock2D, - get_down_block, - get_up_block, -) - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -@dataclass -class UNet2DConditionOutput(BaseOutput): - """ - Args: - sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): - Hidden states conditioned on `encoder_hidden_states` input. Output of last layer of model. - """ - - sample: paddle.Tensor - - -class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): - r""" - UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep - and returns sample shaped output. - - This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the models (such as downloading or saving, etc.) - - Parameters: - sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): - Height and width of input/output sample. - in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample. - out_channels (`int`, *optional*, defaults to 4): The number of channels in the output. - center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample. - flip_sin_to_cos (`bool`, *optional*, defaults to `False`): - Whether to flip the sin to cos in the time embedding. - freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding. - down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`): - The tuple of downsample blocks to use. - mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`): - The mid block type. Choose from `UNetMidBlock2DCrossAttn` or `UNetMidBlock2DSimpleCrossAttn`, will skip the - mid block layer if `None`. - up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`): - The tuple of upsample blocks to use. - only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`): - Whether to include self-attention in the basic transformer blocks, see - [`~models.attention.BasicTransformerBlock`]. - block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): - The tuple of output channels for each block. - layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block. - downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution. - mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block. - act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. 
- norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. - If `None`, it will skip the normalization and activation layers in post-processing. - norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. - cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): - The dimension of the cross attention features. - encoder_hid_dim (`int`, *optional*, defaults to None): - If given, `encoder_hidden_states` will be projected from this dimension to `cross_attention_dim`. - attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. - resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config - for resnet blocks, see [`~models.resnet.ResnetBlock2D`]. Choose from `default` or `scale_shift`. - class_embed_type (`str`, *optional*, defaults to None): - The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`, - `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. - addition_embed_type (`str`, *optional*, defaults to None): - Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or - "text". "text" will use the `TextTimeEmbedding` layer. - num_class_embeds (`int`, *optional*, defaults to None): - Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing - class conditioning with `class_embed_type` equal to `None`. - time_embedding_type (`str`, *optional*, default to `positional`): - The type of position embedding to use for timesteps. Choose from `positional` or `fourier`. - time_embedding_dim (`int`, *optional*, default to `None`): - An optional override for the dimension of the projected time embedding. - time_embedding_act_fn (`str`, *optional*, default to `None`): - Optional activation function to use on the time embeddings only once before they are passed to the rest - of the unet. Choose from `silu`, `mish`, `gelu`, and `swish`. - timestep_post_act (`str`, *optional*, default to `None`): - The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`. - time_cond_proj_dim (`int`, *optional*, default to `None`): - The dimension of `cond_proj` layer in timestep embedding. - conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. - conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. - projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when - using the "projection" `class_embed_type`. Required when using the "projection" `class_embed_type`. - class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time - embeddings with the class embeddings. - mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`): - Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If - `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is None, the - `only_cross_attention` value will be used as the value for `mid_block_only_cross_attention`. Else, it will - default to `False`.
- """ - - _supports_gradient_checkpointing = True - - @register_to_config - def __init__( - self, - sample_size: Optional[int] = None, - in_channels: int = 4, - out_channels: int = 4, - center_input_sample: bool = False, - flip_sin_to_cos: bool = True, - freq_shift: int = 0, - down_block_types: Tuple[str] = ( - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "DownBlock2D", - ), - mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn", - up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), - only_cross_attention: Union[bool, Tuple[bool]] = False, - block_out_channels: Tuple[int] = (320, 640, 1280, 1280), - layers_per_block: Union[int, Tuple[int]] = 2, - downsample_padding: int = 1, - mid_block_scale_factor: float = 1, - act_fn: str = "silu", - norm_num_groups: Optional[int] = 32, - norm_eps: float = 1e-5, - cross_attention_dim: Union[int, Tuple[int]] = 1280, - encoder_hid_dim: Optional[int] = None, - attention_head_dim: Union[int, Tuple[int]] = 8, - dual_cross_attention: bool = False, - use_linear_projection: bool = False, - class_embed_type: Optional[str] = None, - addition_embed_type: Optional[str] = None, - num_class_embeds: Optional[int] = None, - upcast_attention: bool = False, - resnet_time_scale_shift: str = "default", - resnet_skip_time_act: bool = False, - resnet_out_scale_factor: int = 1.0, - time_embedding_type: str = "positional", # fourier, positional - time_embedding_dim: Optional[int] = None, - time_embedding_act_fn: Optional[str] = None, - timestep_post_act: Optional[str] = None, - time_cond_proj_dim: Optional[int] = None, - conv_in_kernel: int = 3, - conv_out_kernel: int = 3, - projection_class_embeddings_input_dim: Optional[int] = None, - class_embeddings_concat: bool = False, - mid_block_only_cross_attention: Optional[bool] = None, - cross_attention_norm: Optional[str] = None, - resnet_pre_temb_non_linearity: Optional[bool] = False, - addition_embed_type_num_heads: int = 64, - ): - super().__init__() - - self.sample_size = sample_size - - # Check inputs - if len(down_block_types) != len(up_block_types): - raise ValueError( - f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}." - ) - - if len(block_out_channels) != len(down_block_types): - raise ValueError( - f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." - ) - - if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): - raise ValueError( - f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." - ) - - if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): - raise ValueError( - f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." - ) - - if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types): - raise ValueError( - f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}." 
- ) - - if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types): - raise ValueError( - f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}." - ) - - # input - conv_in_padding = (conv_in_kernel - 1) // 2 - self.conv_in = nn.Conv2D( - in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding - ) - - # time - if time_embedding_type == "fourier": - time_embed_dim = time_embedding_dim or block_out_channels[0] * 2 - if time_embed_dim % 2 != 0: - raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.") - self.time_proj = GaussianFourierProjection( - time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos - ) - timestep_input_dim = time_embed_dim - elif time_embedding_type == "positional": - time_embed_dim = time_embedding_dim or block_out_channels[0] * 4 - - self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) - timestep_input_dim = block_out_channels[0] - else: - raise ValueError( - f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`." - ) - - self.time_embedding = TimestepEmbedding( - timestep_input_dim, - time_embed_dim, - act_fn=act_fn, - post_act_fn=timestep_post_act, - cond_proj_dim=time_cond_proj_dim, - ) - - if encoder_hid_dim is not None: - self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) - else: - self.encoder_hid_proj = None - - # class embedding - if class_embed_type is None and num_class_embeds is not None: - self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) # int64 - elif class_embed_type == "timestep": - self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn) # float - elif class_embed_type == "identity": - self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) - elif class_embed_type == "projection": - if projection_class_embeddings_input_dim is None: - raise ValueError( - "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set" - ) - # The projection `class_embed_type` is the same as the timestep `class_embed_type` except - # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings - # 2. it projects from an arbitrary input dimension. - # - # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. - # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. - # As a result, `TimestepEmbedding` can be passed arbitrary vectors. 
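# --- Editor's illustration (not part of the original patch) ---------------------------------
# As the comment above notes, the "projection" `class_embed_type` reuses `TimestepEmbedding` as a
# plain MLP, so arbitrary per-sample vectors (rather than sinusoidal timestep embeddings) can be
# projected to `time_embed_dim` and summed with the time embedding. A hedged sketch of that idea,
# assuming a pre-removal ppdiffusers install; the dimensions below are invented for the example:
import paddle
from ppdiffusers.models.embeddings import TimestepEmbedding

projection = TimestepEmbedding(16, 64)   # project 16-dim conditioning vectors to a 64-dim embedding
class_vectors = paddle.randn([2, 16])    # arbitrary vectors, not converted to sinusoidal embeddings
class_emb = projection(class_vectors)    # shape (2, 64); summed with (or concatenated to) the time embedding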
- self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) # float - elif class_embed_type == "simple_projection": - if projection_class_embeddings_input_dim is None: - raise ValueError( - "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set" - ) - self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim) - else: - self.class_embedding = None - - if addition_embed_type == "text": - if encoder_hid_dim is not None: - text_time_embedding_from_dim = encoder_hid_dim - else: - text_time_embedding_from_dim = cross_attention_dim - - self.add_embedding = TextTimeEmbedding( - text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads - ) - elif addition_embed_type is not None: - raise ValueError(f"addition_embed_type: {addition_embed_type} must be None or 'text'.") - - if time_embedding_act_fn is None: - self.time_embed_act = None - elif time_embedding_act_fn == "swish": - self.time_embed_act = lambda x: F.silu(x) - elif time_embedding_act_fn == "mish": - self.time_embed_act = nn.Mish() - elif time_embedding_act_fn == "silu": - self.time_embed_act = nn.Silu() - elif time_embedding_act_fn == "gelu": - self.time_embed_act = nn.GELU() - else: - raise ValueError(f"Unsupported activation function: {time_embedding_act_fn}") - - self.down_blocks = nn.LayerList([]) - self.up_blocks = nn.LayerList([]) - - if isinstance(only_cross_attention, bool): - if mid_block_only_cross_attention is None: - mid_block_only_cross_attention = only_cross_attention - only_cross_attention = [only_cross_attention] * len(down_block_types) - - if mid_block_only_cross_attention is None: - mid_block_only_cross_attention = False - - if isinstance(attention_head_dim, int): - attention_head_dim = (attention_head_dim,) * len(down_block_types) - - if isinstance(cross_attention_dim, int): - cross_attention_dim = (cross_attention_dim,) * len(down_block_types) - - if isinstance(layers_per_block, int): - layers_per_block = [layers_per_block] * len(down_block_types) - - if class_embeddings_concat: - # The time embeddings are concatenated with the class embeddings. 
The dimension of the - # time embeddings passed to the down, middle, and up blocks is twice the dimension of the - # regular time embeddings - blocks_time_embed_dim = time_embed_dim * 2 - else: - blocks_time_embed_dim = time_embed_dim - - # pre_temb_act_fun opt - self.resnet_pre_temb_non_linearity = resnet_pre_temb_non_linearity - if resnet_pre_temb_non_linearity: - if act_fn == "swish": - self.down_resnet_temb_nonlinearity = lambda x: F.silu(x) - elif act_fn == "mish": - self.down_resnet_temb_nonlinearity = nn.Mish() - elif act_fn == "silu": - self.down_resnet_temb_nonlinearity = nn.Silu() - elif act_fn == "gelu": - self.down_resnet_temb_nonlinearity = nn.GELU() - - # down - output_channel = block_out_channels[0] - for i, down_block_type in enumerate(down_block_types): - input_channel = output_channel - output_channel = block_out_channels[i] - is_final_block = i == len(block_out_channels) - 1 - - down_block = get_down_block( - down_block_type, - num_layers=layers_per_block[i], - in_channels=input_channel, - out_channels=output_channel, - temb_channels=blocks_time_embed_dim, - add_downsample=not is_final_block, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - resnet_groups=norm_num_groups, - cross_attention_dim=cross_attention_dim[i], - attn_num_head_channels=attention_head_dim[i], - downsample_padding=downsample_padding, - dual_cross_attention=dual_cross_attention, - use_linear_projection=use_linear_projection, - only_cross_attention=only_cross_attention[i], - upcast_attention=upcast_attention, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_skip_time_act=resnet_skip_time_act, - resnet_out_scale_factor=resnet_out_scale_factor, - cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - self.down_blocks.append(down_block) - - # mid - if mid_block_type == "UNetMidBlock2DCrossAttn": - self.mid_block = UNetMidBlock2DCrossAttn( - in_channels=block_out_channels[-1], - temb_channels=blocks_time_embed_dim, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - output_scale_factor=mid_block_scale_factor, - resnet_time_scale_shift=resnet_time_scale_shift, - cross_attention_dim=cross_attention_dim[-1], - attn_num_head_channels=attention_head_dim[-1], - resnet_groups=norm_num_groups, - dual_cross_attention=dual_cross_attention, - use_linear_projection=use_linear_projection, - upcast_attention=upcast_attention, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn": - self.mid_block = UNetMidBlock2DSimpleCrossAttn( - in_channels=block_out_channels[-1], - temb_channels=blocks_time_embed_dim, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - output_scale_factor=mid_block_scale_factor, - cross_attention_dim=cross_attention_dim[-1], - attn_num_head_channels=attention_head_dim[-1], - resnet_groups=norm_num_groups, - resnet_time_scale_shift=resnet_time_scale_shift, - skip_time_act=resnet_skip_time_act, - only_cross_attention=mid_block_only_cross_attention, - cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif mid_block_type is None: - self.mid_block = None - else: - raise ValueError(f"unknown mid_block_type : {mid_block_type}") - - # count how many layers upsample the images - self.num_upsamplers = 0 - - # up - reversed_block_out_channels = list(reversed(block_out_channels)) - reversed_attention_head_dim = list(reversed(attention_head_dim)) - reversed_layers_per_block = list(reversed(layers_per_block)) - 
reversed_cross_attention_dim = list(reversed(cross_attention_dim)) - reversed_only_cross_attention = list(reversed(only_cross_attention)) - - output_channel = reversed_block_out_channels[0] - for i, up_block_type in enumerate(up_block_types): - is_final_block = i == len(block_out_channels) - 1 - - prev_output_channel = output_channel - output_channel = reversed_block_out_channels[i] - input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] - - # add upsample block for all BUT final layer - if not is_final_block: - add_upsample = True - self.num_upsamplers += 1 - else: - add_upsample = False - - up_block = get_up_block( - up_block_type, - num_layers=reversed_layers_per_block[i] + 1, - in_channels=input_channel, - out_channels=output_channel, - prev_output_channel=prev_output_channel, - temb_channels=blocks_time_embed_dim, - add_upsample=add_upsample, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - resnet_groups=norm_num_groups, - cross_attention_dim=reversed_cross_attention_dim[i], - attn_num_head_channels=reversed_attention_head_dim[i], - dual_cross_attention=dual_cross_attention, - use_linear_projection=use_linear_projection, - only_cross_attention=reversed_only_cross_attention[i], - upcast_attention=upcast_attention, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_skip_time_act=resnet_skip_time_act, - resnet_out_scale_factor=resnet_out_scale_factor, - cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - self.up_blocks.append(up_block) - prev_output_channel = output_channel - - # out - if norm_num_groups is not None: - self.conv_norm_out = nn.GroupNorm( - num_channels=block_out_channels[0], num_groups=norm_num_groups, epsilon=norm_eps - ) - if act_fn == "swish": - self.conv_act = lambda x: F.silu(x) - elif act_fn == "mish": - self.conv_act = nn.Mish() - elif act_fn == "silu": - self.conv_act = nn.Silu() - elif act_fn == "gelu": - self.conv_act = nn.GELU() - else: - raise ValueError(f"Unsupported activation function: {act_fn}") - else: - self.conv_norm_out = None - self.conv_act = None - - conv_out_padding = (conv_out_kernel - 1) // 2 - self.conv_out = nn.Conv2D( - block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding - ) - - @property - def attn_processors(self) -> Dict[str, AttentionProcessor]: - r""" - Returns: - `dict` of attention processors: A dictionary containing all attention processors used in the model with - indexed by its weight name. - """ - # set recursively - processors = {} - - def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]): - if hasattr(module, "set_processor"): - processors[f"{name}.processor"] = module.processor - - for sub_name, child in module.named_children(): - fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) - - return processors - - for name, module in self.named_children(): - fn_recursive_add_processors(name, module, processors) - - return processors - - def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): - r""" - Parameters: - `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): - The instantiated processor class or a dictionary of processor classes that will be set as the processor - of **all** `Attention` layers. - In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. 
This is strongly recommended when setting trainable attention processors.: - - """ - count = len(self.attn_processors.keys()) - - if isinstance(processor, dict) and len(processor) != count: - raise ValueError( - f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" - f" number of attention layers: {count}. Please make sure to pass {count} processor classes." - ) - - def fn_recursive_attn_processor(name: str, module: nn.Layer, processor): - if hasattr(module, "set_processor"): - if not isinstance(processor, dict): - module.set_processor(processor) - else: - module.set_processor(processor.pop(f"{name}.processor")) - - for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) - - for name, module in self.named_children(): - fn_recursive_attn_processor(name, module, processor) - - def set_default_attn_processor(self): - """ - Disables custom attention processors and sets the default attention implementation. - """ - self.set_attn_processor(AttnProcessor()) - - def set_attention_slice(self, slice_size): - r""" - Enable sliced attention computation. - - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - - Args: - slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is - provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` - must be a multiple of `slice_size`. - """ - sliceable_head_dims = [] - - def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): - if hasattr(module, "set_attention_slice"): - sliceable_head_dims.append(module.sliceable_head_dim) - - for child in module.children(): - fn_recursive_retrieve_sliceable_dims(child) - - # retrieve number of attention layers - for module in self.children(): - fn_recursive_retrieve_sliceable_dims(module) - - num_sliceable_layers = len(sliceable_head_dims) - - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = [dim // 2 for dim in sliceable_head_dims] - elif slice_size == "max": - # make smallest slice possible - slice_size = num_sliceable_layers * [1] - - slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size - - if len(slice_size) != len(sliceable_head_dims): - raise ValueError( - f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" - f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." - ) - - for i in range(len(slice_size)): - size = slice_size[i] - dim = sliceable_head_dims[i] - if size is not None and size > dim: - raise ValueError(f"size {size} has to be smaller or equal to {dim}.") - - # Recursively walk through all the children. 
- # Any children which exposes the set_attention_slice method - # gets the message - def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]): - if hasattr(module, "set_attention_slice"): - module.set_attention_slice(slice_size.pop()) - - for child in module.children(): - fn_recursive_set_attention_slice(child, slice_size) - - reversed_slice_size = list(reversed(slice_size)) - for module in self.children(): - fn_recursive_set_attention_slice(module, reversed_slice_size) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)): - module.gradient_checkpointing = value - - def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - encoder_hidden_states: paddle.Tensor, - class_labels: Optional[paddle.Tensor] = None, - timestep_cond: Optional[paddle.Tensor] = None, - attention_mask: Optional[paddle.Tensor] = None, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None, - mid_block_additional_residual: Optional[paddle.Tensor] = None, - return_dict: bool = True, - ) -> Union[UNet2DConditionOutput, Tuple]: - r""" - Args: - sample (`paddle.Tensor`): (batch, channel, height, width) noisy inputs tensor - timestep (`paddle.Tensor` or `float` or `int`): (batch) timesteps - encoder_hidden_states (`paddle.Tensor`): (batch, sequence_length, feature_dim) encoder hidden states - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/ppdiffusers/models/cross_attention.py). - - Returns: - [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: - [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. - """ - # TODO junnyu, add this to support pure fp16 - sample = sample.cast(self.dtype) - - # By default samples have to be AT least a multiple of the overall upsampling factor. - # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). - # However, the upsampling interpolation output size can be forced to fit any upsampling size - # on the fly if necessary. - default_overall_up_factor = 2**self.num_upsamplers - - # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` - forward_upsample_size = False - upsample_size = None - - if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): - logger.info("Forward upsample size to force interpolation output size.") - forward_upsample_size = True - - # prepare attention_mask - if attention_mask is not None: - attention_mask = (1 - attention_mask.cast(sample.dtype)) * NEG_INF - attention_mask = attention_mask.unsqueeze(1) - - # 0. center input if necessary - if self.config.center_input_sample: - sample = 2 * sample - 1.0 - - # 1. 
time - timesteps = timestep - if not paddle.is_tensor(timesteps): - timesteps = paddle.to_tensor([timesteps], dtype="int64") - elif len(timesteps.shape) == 0: - timesteps = timesteps[None] - - # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps.expand( - [ - sample.shape[0], - ] - ) - t_emb = self.time_proj(timesteps) - - # `Timesteps` does not contain any weights and will always return f32 tensors - # but time_embedding might actually be running in fp16. so we need to cast here. - # there might be better ways to encapsulate this. - t_emb = t_emb.cast(self.dtype) - - emb = self.time_embedding(t_emb, timestep_cond) - - if self.class_embedding is not None: - if class_labels is None: - raise ValueError("class_labels should be provided when num_class_embeds > 0") - - # maybe cast it to float16 - class_labels = class_labels.cast(self.dtype) - if self.config.class_embed_type == "timestep": - class_labels = self.time_proj(class_labels) - # `Timesteps` does not contain any weights and will always return f32 tensors - # there might be better ways to encapsulate this. - class_labels = class_labels.cast(sample.dtype) - - # maybe cast it to int64 - if isinstance(self.class_embedding, nn.Embedding): - class_labels = class_labels.cast(paddle.int64) - class_emb = self.class_embedding(class_labels).cast(self.dtype) - - if self.config.class_embeddings_concat: - emb = paddle.concat([emb, class_emb], axis=-1) - else: - emb = emb + class_emb - - if self.config.addition_embed_type == "text": - aug_emb = self.add_embedding(encoder_hidden_states) - emb = emb + aug_emb - - if self.resnet_pre_temb_non_linearity: - emb = self.down_resnet_temb_nonlinearity(emb) - else: - if self.time_embed_act is not None: - emb = self.time_embed_act(emb) - - if self.encoder_hid_proj is not None: - encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states) - - # 2. pre-process - sample = self.conv_in(sample) - - # 3. down - - is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None - is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None - - down_block_res_samples = (sample,) - - for downsample_block in self.down_blocks: - if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: - additional_kwargs = {} - if is_adapter and len(down_block_additional_residuals) > 0: - additional_kwargs["additional_residuals"] = down_block_additional_residuals.pop(0) - - sample, res_samples = downsample_block( - hidden_states=sample, - temb=emb, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, - **additional_kwargs, - ) - else: - sample, res_samples = downsample_block(hidden_states=sample, temb=emb) - - if is_adapter and len(down_block_additional_residuals) > 0: - sample += down_block_additional_residuals.pop(0) - # westfish: add to align with torch features - res_samples = tuple(res_samples[:-1]) + (sample,) - down_block_res_samples += res_samples - - if is_controlnet: - new_down_block_res_samples = () - - for down_block_res_sample, down_block_additional_residual in zip( - down_block_res_samples, down_block_additional_residuals - ): - down_block_res_sample = down_block_res_sample + down_block_additional_residual - new_down_block_res_samples += (down_block_res_sample,) - down_block_res_samples = new_down_block_res_samples - - # 4. 
mid - if self.mid_block is not None: - sample = self.mid_block( - sample, - emb, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, - ) - - if is_controlnet: - sample = sample + mid_block_additional_residual - - # 5. up - for i, upsample_block in enumerate(self.up_blocks): - is_final_block = i == len(self.up_blocks) - 1 - - res_samples = down_block_res_samples[-len(upsample_block.resnets) :] - down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] - - # if we have not reached the final block and need to forward the - # upsample size, we do it here - if not is_final_block and forward_upsample_size: - upsample_size = down_block_res_samples[-1].shape[2:] - - if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: - sample = upsample_block( - hidden_states=sample, - temb=emb, - res_hidden_states_tuple=res_samples, - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - upsample_size=upsample_size, - attention_mask=attention_mask, - ) - else: - sample = upsample_block( - hidden_states=sample, - temb=emb, - res_hidden_states_tuple=res_samples, - upsample_size=upsample_size, - ) - - # 6. post-process - if self.conv_norm_out: - sample = self.conv_norm_out(sample) - sample = self.conv_act(sample) - sample = self.conv_out(sample) - - if not return_dict: - return (sample,) - - return UNet2DConditionOutput(sample=sample) diff --git a/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py b/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py deleted file mode 100644 index 9baa9bc52d52..000000000000 --- a/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py +++ /dev/null @@ -1,638 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
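# --- Editor's illustration (not part of the original patch) ---------------------------------
# A minimal sketch of how the forward pass of the UNet2DConditionModel removed above was typically
# driven, following the argument shapes documented in its docstring. It assumes a pre-removal
# ppdiffusers install; the tiny config below is an assumption chosen only to keep the example small.
import paddle
from ppdiffusers import UNet2DConditionModel

unet = UNet2DConditionModel(
    sample_size=32,
    block_out_channels=(32, 64),
    down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
    up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
    cross_attention_dim=64,
    attention_head_dim=8,
)
noisy_latents = paddle.randn([1, 4, 32, 32])   # sample: (batch, channel, height, width)
timestep = paddle.to_tensor([10])              # timestep: (batch,) diffusion steps
text_states = paddle.randn([1, 77, 64])        # encoder_hidden_states: (batch, sequence_length, feature_dim)
out = unet(noisy_latents, timestep, encoder_hidden_states=text_states)
print(out.sample.shape)                        # [1, 4, 32, 32], same shape as the input latents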
- -import paddle -import paddle.nn as nn - -from .resnet import Downsample2D, ResnetBlock2D, TemporalConvLayer, Upsample2D -from .transformer_2d import Transformer2DModel -from .transformer_temporal import TransformerTemporalModel - - -def get_down_block( - down_block_type, - num_layers, - in_channels, - out_channels, - temb_channels, - add_downsample, - resnet_eps, - resnet_act_fn, - attn_num_head_channels, - resnet_groups=None, - cross_attention_dim=None, - downsample_padding=None, - dual_cross_attention=False, - use_linear_projection=True, - only_cross_attention=False, - upcast_attention=False, - resnet_time_scale_shift="default", -): - if down_block_type == "DownBlock3D": - return DownBlock3D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - add_downsample=add_downsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - downsample_padding=downsample_padding, - resnet_time_scale_shift=resnet_time_scale_shift, - ) - elif down_block_type == "CrossAttnDownBlock3D": - if cross_attention_dim is None: - raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock3D") - return CrossAttnDownBlock3D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - add_downsample=add_downsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - downsample_padding=downsample_padding, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, - dual_cross_attention=dual_cross_attention, - use_linear_projection=use_linear_projection, - only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, - resnet_time_scale_shift=resnet_time_scale_shift, - ) - raise ValueError(f"{down_block_type} does not exist.") - - -def get_up_block( - up_block_type, - num_layers, - in_channels, - out_channels, - prev_output_channel, - temb_channels, - add_upsample, - resnet_eps, - resnet_act_fn, - attn_num_head_channels, - resnet_groups=None, - cross_attention_dim=None, - dual_cross_attention=False, - use_linear_projection=True, - only_cross_attention=False, - upcast_attention=False, - resnet_time_scale_shift="default", -): - if up_block_type == "UpBlock3D": - return UpBlock3D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - prev_output_channel=prev_output_channel, - temb_channels=temb_channels, - add_upsample=add_upsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - resnet_time_scale_shift=resnet_time_scale_shift, - ) - elif up_block_type == "CrossAttnUpBlock3D": - if cross_attention_dim is None: - raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock3D") - return CrossAttnUpBlock3D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - prev_output_channel=prev_output_channel, - temb_channels=temb_channels, - add_upsample=add_upsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, - dual_cross_attention=dual_cross_attention, - use_linear_projection=use_linear_projection, - only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, - resnet_time_scale_shift=resnet_time_scale_shift, - ) - raise ValueError(f"{up_block_type} does not exist.") - - -class 
UNetMidBlock3DCrossAttn(nn.Layer): - def __init__( - self, - in_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-06, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - attn_num_head_channels=1, - output_scale_factor=1.0, - cross_attention_dim=1280, - dual_cross_attention=False, - use_linear_projection=True, - upcast_attention=False, - ): - super().__init__() - self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels - resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) - # there is always at least one resnet - resnets = [ - ResnetBlock2D( - in_channels=in_channels, - out_channels=in_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - ) - ] - temp_convs = [ - TemporalConvLayer( - in_channels, - in_channels, - dropout=0.1, - ) - ] - attentions = [] - temp_attentions = [] - for _ in range(num_layers): - attentions.append( - Transformer2DModel( - in_channels // attn_num_head_channels, - attn_num_head_channels, - in_channels=in_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - use_linear_projection=use_linear_projection, - upcast_attention=upcast_attention, - ) - ) - temp_attentions.append( - TransformerTemporalModel( - in_channels // attn_num_head_channels, - attn_num_head_channels, - in_channels=in_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - ) - ) - resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=in_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - ) - ) - temp_convs.append( - TemporalConvLayer( - in_channels, - in_channels, - dropout=0.1, - ) - ) - self.resnets = nn.LayerList(resnets) - self.temp_convs = nn.LayerList(temp_convs) - self.attentions = nn.LayerList(attentions) - self.temp_attentions = nn.LayerList(temp_attentions) - - def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - num_frames=1, - cross_attention_kwargs=None, - ): - hidden_states = self.resnets[0](hidden_states, temb) - hidden_states = self.temp_convs[0](hidden_states, num_frames=num_frames) - for attn, temp_attn, resnet, temp_conv in zip( - self.attentions, self.temp_attentions, self.resnets[1:], self.temp_convs[1:] - ): - hidden_states = attn( - hidden_states, - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - hidden_states = temp_attn( - hidden_states, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs - ).sample - hidden_states = resnet(hidden_states, temb) - hidden_states = temp_conv(hidden_states, num_frames=num_frames) - return hidden_states - - -class CrossAttnDownBlock3D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-06, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - 
resnet_pre_norm: bool = True, - attn_num_head_channels=1, - cross_attention_dim=1280, - output_scale_factor=1.0, - downsample_padding=1, - add_downsample=True, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, - ): - super().__init__() - resnets = [] - attentions = [] - temp_attentions = [] - temp_convs = [] - self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels - for i in range(num_layers): - in_channels = in_channels if i == 0 else out_channels - resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - ) - ) - temp_convs.append( - TemporalConvLayer( - out_channels, - out_channels, - dropout=0.1, - ) - ) - attentions.append( - Transformer2DModel( - out_channels // attn_num_head_channels, - attn_num_head_channels, - in_channels=out_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - use_linear_projection=use_linear_projection, - only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, - ) - ) - temp_attentions.append( - TransformerTemporalModel( - out_channels // attn_num_head_channels, - attn_num_head_channels, - in_channels=out_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - ) - ) - self.resnets = nn.LayerList(resnets) - self.temp_convs = nn.LayerList(temp_convs) - self.attentions = nn.LayerList(attentions) - self.temp_attentions = nn.LayerList(temp_attentions) - if add_downsample: - self.downsamplers = nn.LayerList( - [ - Downsample2D( - out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" - ) - ] - ) - else: - self.downsamplers = None - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - num_frames=1, - cross_attention_kwargs=None, - ): - output_states = () - for resnet, temp_conv, attn, temp_attn in zip( - self.resnets, self.temp_convs, self.attentions, self.temp_attentions - ): - hidden_states = resnet(hidden_states, temb) - hidden_states = temp_conv(hidden_states, num_frames=num_frames) - hidden_states = attn( - hidden_states, - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - hidden_states = temp_attn( - hidden_states, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs - ).sample - output_states += (hidden_states,) - if self.downsamplers is not None: - for downsampler in self.downsamplers: - hidden_states = downsampler(hidden_states) - output_states += (hidden_states,) - return hidden_states, output_states - - -class DownBlock3D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-06, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - output_scale_factor=1.0, - add_downsample=True, - downsample_padding=1, - ): - super().__init__() - resnets = [] - temp_convs = [] - for i in range(num_layers): - in_channels = in_channels if i == 0 else out_channels - resnets.append( - 
ResnetBlock2D( - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - ) - ) - temp_convs.append( - TemporalConvLayer( - out_channels, - out_channels, - dropout=0.1, - ) - ) - self.resnets = nn.LayerList(resnets) - self.temp_convs = nn.LayerList(temp_convs) - if add_downsample: - self.downsamplers = nn.LayerList( - [ - Downsample2D( - out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" - ) - ] - ) - else: - self.downsamplers = None - self.gradient_checkpointing = False - - def forward(self, hidden_states, temb=None, num_frames=1): - output_states = () - for resnet, temp_conv in zip(self.resnets, self.temp_convs): - hidden_states = resnet(hidden_states, temb) - hidden_states = temp_conv(hidden_states, num_frames=num_frames) - output_states += (hidden_states,) - if self.downsamplers is not None: - for downsampler in self.downsamplers: - hidden_states = downsampler(hidden_states) - output_states += (hidden_states,) - return hidden_states, output_states - - -class CrossAttnUpBlock3D(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - prev_output_channel: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-06, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - attn_num_head_channels=1, - cross_attention_dim=1280, - output_scale_factor=1.0, - add_upsample=True, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, - ): - super().__init__() - resnets = [] - temp_convs = [] - attentions = [] - temp_attentions = [] - self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels - for i in range(num_layers): - res_skip_channels = in_channels if (i == num_layers - 1) else out_channels - resnet_in_channels = prev_output_channel if i == 0 else out_channels - resnets.append( - ResnetBlock2D( - in_channels=resnet_in_channels + res_skip_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - ) - ) - temp_convs.append( - TemporalConvLayer( - out_channels, - out_channels, - dropout=0.1, - ) - ) - attentions.append( - Transformer2DModel( - out_channels // attn_num_head_channels, - attn_num_head_channels, - in_channels=out_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - use_linear_projection=use_linear_projection, - only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, - ) - ) - temp_attentions.append( - TransformerTemporalModel( - out_channels // attn_num_head_channels, - attn_num_head_channels, - in_channels=out_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - ) - ) - self.resnets = nn.LayerList(sublayers=resnets) - self.temp_convs = nn.LayerList(sublayers=temp_convs) - self.attentions = nn.LayerList(sublayers=attentions) - self.temp_attentions = nn.LayerList(sublayers=temp_attentions) - if add_upsample: - self.upsamplers = 
nn.LayerList( - sublayers=[Upsample2D(out_channels, use_conv=True, out_channels=out_channels)] - ) - else: - self.upsamplers = None - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - res_hidden_states_tuple, - temb=None, - encoder_hidden_states=None, - upsample_size=None, - attention_mask=None, - num_frames=1, - cross_attention_kwargs=None, - ): - for resnet, temp_conv, attn, temp_attn in zip( - self.resnets, self.temp_convs, self.attentions, self.temp_attentions - ): - # pop res hidden states - res_hidden_states = res_hidden_states_tuple[-1] - res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) - hidden_states = resnet(hidden_states, temb) - hidden_states = temp_conv(hidden_states, num_frames=num_frames) - hidden_states = attn( - hidden_states, - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - hidden_states = temp_attn( - hidden_states, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs - ).sample - if self.upsamplers is not None: - for upsampler in self.upsamplers: - hidden_states = upsampler(hidden_states, upsample_size) - return hidden_states - - -class UpBlock3D(nn.Layer): - def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-06, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - output_scale_factor=1.0, - add_upsample=True, - ): - super().__init__() - resnets = [] - temp_convs = [] - for i in range(num_layers): - res_skip_channels = in_channels if (i == num_layers - 1) else out_channels - resnet_in_channels = prev_output_channel if i == 0 else out_channels - resnets.append( - ResnetBlock2D( - in_channels=resnet_in_channels + res_skip_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - ) - ) - temp_convs.append( - TemporalConvLayer( - out_channels, - out_channels, - dropout=0.1, - ) - ) - self.resnets = nn.LayerList(resnets) - self.temp_convs = nn.LayerList(temp_convs) - if add_upsample: - self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) - else: - self.upsamplers = None - self.gradient_checkpointing = False - - def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, num_frames=1): - for resnet, temp_conv in zip(self.resnets, self.temp_convs): - # pop res hidden states - res_hidden_states = res_hidden_states_tuple[-1] - res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat(x=[hidden_states, res_hidden_states], axis=1) - hidden_states = resnet(hidden_states, temb) - hidden_states = temp_conv(hidden_states, num_frames=num_frames) - if self.upsamplers is not None: - for upsampler in self.upsamplers: - hidden_states = upsampler(hidden_states, upsample_size) - return hidden_states diff --git a/ppdiffusers/ppdiffusers/models/unet_3d_condition.py b/ppdiffusers/ppdiffusers/models/unet_3d_condition.py deleted file mode 100644 index 0a1cb6be41ff..000000000000 --- a/ppdiffusers/ppdiffusers/models/unet_3d_condition.py +++ /dev/null @@ -1,527 +0,0 @@ -# 
Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 Alibaba DAMO-VILAB and The HuggingFace Team. All rights reserved. -# Copyright 2023 The ModelScope Team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Union - -import paddle -import paddle.nn as nn - -from ..configuration_utils import ConfigMixin, register_to_config -from ..loaders import UNet2DConditionLoadersMixin -from ..utils import BaseOutput, logging -from .attention_processor import AttentionProcessor, AttnProcessor -from .embeddings import TimestepEmbedding, Timesteps -from .modeling_utils import ModelMixin -from .transformer_temporal import TransformerTemporalModel -from .unet_3d_blocks import ( - CrossAttnDownBlock3D, - CrossAttnUpBlock3D, - DownBlock3D, - UNetMidBlock3DCrossAttn, - UpBlock3D, - get_down_block, - get_up_block, -) - -logger = logging.get_logger(__name__) - - -@dataclass -class UNet3DConditionOutput(BaseOutput): - """ - Args: - sample (`paddle.Tensor` of shape `(batch_size, num_frames, num_channels, height, width)`): - Hidden states conditioned on `encoder_hidden_states` input. Output of last layer of model. - """ - - sample: paddle.Tensor - - -class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): - r""" - UNet3DConditionModel is a conditional 3D UNet model that takes in a noisy sample, conditional state, and a timestep - and returns sample shaped output. - - This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the models (such as downloading or saving, etc.) - - Parameters: - sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): - Height and width of input/output sample. - in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample. - out_channels (`int`, *optional*, defaults to 4): The number of channels in the output. - down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D")`): - The tuple of downsample blocks to use. - up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D",)`): - The tuple of upsample blocks to use. - block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): - The tuple of output channels for each block. - layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block. - downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution. - mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block. - act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. - norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
- If `None`, it will skip the normalization and activation layers in post-processing - norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. - cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features. - attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. - """ - - _supports_gradient_checkpointing = False - - @register_to_config - def __init__( - self, - sample_size: Optional[int] = None, - in_channels: int = 4, - out_channels: int = 4, - down_block_types: Tuple[str] = ( - "CrossAttnDownBlock3D", - "CrossAttnDownBlock3D", - "CrossAttnDownBlock3D", - "DownBlock3D", - ), - up_block_types: Tuple[str] = ("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"), - block_out_channels: Tuple[int] = (320, 640, 1280, 1280), - layers_per_block: int = 2, - downsample_padding: int = 1, - mid_block_scale_factor: float = 1, - act_fn: str = "silu", - norm_num_groups: Optional[int] = 32, - norm_eps: float = 1e-05, - cross_attention_dim: int = 1024, - attention_head_dim: Union[int, Tuple[int]] = 64, - ): - super().__init__() - self.sample_size = sample_size - # Check inputs - if len(down_block_types) != len(up_block_types): - raise ValueError( - f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}." - ) - if len(block_out_channels) != len(down_block_types): - raise ValueError( - f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." - ) - if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): - raise ValueError( - f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." 
- ) - conv_in_kernel = 3 - conv_out_kernel = 3 - conv_in_padding = (conv_in_kernel - 1) // 2 - self.conv_in = nn.Conv2D( - in_channels=in_channels, - out_channels=block_out_channels[0], - kernel_size=conv_in_kernel, - padding=conv_in_padding, - ) - # time - time_embed_dim = block_out_channels[0] * 4 - self.time_proj = Timesteps(block_out_channels[0], True, 0) - timestep_input_dim = block_out_channels[0] - self.time_embedding = TimestepEmbedding( - timestep_input_dim, - time_embed_dim, - act_fn=act_fn, - ) - self.transformer_in = TransformerTemporalModel( - num_attention_heads=8, - attention_head_dim=attention_head_dim, - in_channels=block_out_channels[0], - num_layers=1, - ) - # class embedding - self.down_blocks = nn.LayerList(sublayers=[]) - self.up_blocks = nn.LayerList(sublayers=[]) - if isinstance(attention_head_dim, int): - attention_head_dim = (attention_head_dim,) * len(down_block_types) - - # down - output_channel = block_out_channels[0] - for i, down_block_type in enumerate(down_block_types): - input_channel = output_channel - output_channel = block_out_channels[i] - is_final_block = i == len(block_out_channels) - 1 - down_block = get_down_block( - down_block_type, - num_layers=layers_per_block, - in_channels=input_channel, - out_channels=output_channel, - temb_channels=time_embed_dim, - add_downsample=not is_final_block, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - resnet_groups=norm_num_groups, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attention_head_dim[i], - downsample_padding=downsample_padding, - dual_cross_attention=False, - ) - self.down_blocks.append(down_block) - # mid - self.mid_block = UNetMidBlock3DCrossAttn( - in_channels=block_out_channels[-1], - temb_channels=time_embed_dim, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - output_scale_factor=mid_block_scale_factor, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attention_head_dim[-1], - resnet_groups=norm_num_groups, - dual_cross_attention=False, - ) - # count how many layers upsample the images - self.num_upsamplers = 0 - # up - reversed_block_out_channels = list(reversed(block_out_channels)) - reversed_attention_head_dim = list(reversed(attention_head_dim)) - output_channel = reversed_block_out_channels[0] - for i, up_block_type in enumerate(up_block_types): - is_final_block = i == len(block_out_channels) - 1 - prev_output_channel = output_channel - output_channel = reversed_block_out_channels[i] - input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] - # add upsample block for all BUT final layer - if not is_final_block: - add_upsample = True - self.num_upsamplers += 1 - else: - add_upsample = False - up_block = get_up_block( - up_block_type, - num_layers=layers_per_block + 1, - in_channels=input_channel, - out_channels=output_channel, - prev_output_channel=prev_output_channel, - temb_channels=time_embed_dim, - add_upsample=add_upsample, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - resnet_groups=norm_num_groups, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=reversed_attention_head_dim[i], - dual_cross_attention=False, - ) - self.up_blocks.append(up_block) - prev_output_channel = output_channel - if norm_num_groups is not None: - self.conv_norm_out = nn.GroupNorm( - num_channels=block_out_channels[0], - num_groups=norm_num_groups, - epsilon=norm_eps, - ) - self.conv_act = nn.Silu() - else: - self.conv_norm_out = None - self.conv_act = None - conv_out_padding = (conv_out_kernel - 1) // 2 - self.conv_out = 
nn.Conv2D( - in_channels=block_out_channels[0], - out_channels=out_channels, - kernel_size=conv_out_kernel, - padding=conv_out_padding, - ) - - @property - # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors - def attn_processors(self) -> Dict[str, AttentionProcessor]: - r""" - Returns: - `dict` of attention processors: A dictionary containing all attention processors used in the model with - indexed by its weight name. - """ - # set recursively - processors = {} - - def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]): - if hasattr(module, "set_processor"): - processors[f"{name}.processor"] = module.processor - - for sub_name, child in module.named_children(): - fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) - - return processors - - for name, module in self.named_children(): - fn_recursive_add_processors(name, module, processors) - - return processors - - # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attention_slice - def set_attention_slice(self, slice_size): - r""" - Enable sliced attention computation. - - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - - Args: - slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is - provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` - must be a multiple of `slice_size`. - """ - sliceable_head_dims = [] - - def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): - if hasattr(module, "set_attention_slice"): - sliceable_head_dims.append(module.sliceable_head_dim) - - for child in module.children(): - fn_recursive_retrieve_sliceable_dims(child) - - # retrieve number of attention layers - for module in self.children(): - fn_recursive_retrieve_sliceable_dims(module) - - num_sliceable_layers = len(sliceable_head_dims) - - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = [dim // 2 for dim in sliceable_head_dims] - elif slice_size == "max": - # make smallest slice possible - slice_size = num_sliceable_layers * [1] - - slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size - - if len(slice_size) != len(sliceable_head_dims): - raise ValueError( - f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" - f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." - ) - - for i in range(len(slice_size)): - size = slice_size[i] - dim = sliceable_head_dims[i] - if size is not None and size > dim: - raise ValueError(f"size {size} has to be smaller or equal to {dim}.") - - # Recursively walk through all the children. 
- # Any children which exposes the set_attention_slice method - # gets the message - def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]): - if hasattr(module, "set_attention_slice"): - module.set_attention_slice(slice_size.pop()) - - for child in module.children(): - fn_recursive_set_attention_slice(child, slice_size) - - reversed_slice_size = list(reversed(slice_size)) - for module in self.children(): - fn_recursive_set_attention_slice(module, reversed_slice_size) - - # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor - def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): - r""" - Parameters: - `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): - The instantiated processor class or a dictionary of processor classes that will be set as the processor - of **all** `Attention` layers. - In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.: - - """ - count = len(self.attn_processors.keys()) - - if isinstance(processor, dict) and len(processor) != count: - raise ValueError( - f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" - f" number of attention layers: {count}. Please make sure to pass {count} processor classes." - ) - - def fn_recursive_attn_processor(name: str, module: nn.Layer, processor): - if hasattr(module, "set_processor"): - if not isinstance(processor, dict): - module.set_processor(processor) - else: - module.set_processor(processor.pop(f"{name}.processor")) - - for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) - - for name, module in self.named_children(): - fn_recursive_attn_processor(name, module, processor) - - # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor - def set_default_attn_processor(self): - """ - Disables custom attention processors and sets the default attention implementation. - """ - self.set_attn_processor(AttnProcessor()) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)): - module.gradient_checkpointing = value - - def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - encoder_hidden_states: paddle.Tensor, - class_labels: Optional[paddle.Tensor] = None, - timestep_cond: Optional[paddle.Tensor] = None, - attention_mask: Optional[paddle.Tensor] = None, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None, - mid_block_additional_residual: Optional[paddle.Tensor] = None, - return_dict: bool = True, - ) -> Union[UNet3DConditionOutput, Tuple]: - """ - Args: - sample (`paddle.Tensor`): (batch, num_frames, channel, height, width) noisy inputs tensor - timestep (`paddle.Tensor` or `float` or `int`): (batch) timesteps - encoder_hidden_states (`paddle.Tensor`): (batch, sequence_length, feature_dim) encoder hidden states - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unet_2d_condition.UNet3DConditionOutput`] instead of a plain tuple. 
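# A minimal sketch of the slice-size resolution described in set_attention_slice above:
# "auto" halves each sliceable head dim, "max" uses one slice at a time, and an int is
# broadcast to every layer. The dims below are made-up values standing in for the head
# dims that the method collects recursively from the model.
sliceable_head_dims = [8, 16, 16, 8]

def resolve_slice_size(slice_size):
    if slice_size == "auto":
        return [dim // 2 for dim in sliceable_head_dims]  # attention computed in two steps
    if slice_size == "max":
        return [1] * len(sliceable_head_dims)             # smallest slices, lowest memory
    return [slice_size] * len(sliceable_head_dims)        # same explicit size everywhere

print(resolve_slice_size("auto"))  # [4, 8, 8, 4]
print(resolve_slice_size("max"))   # [1, 1, 1, 1]
print(resolve_slice_size(4))       # [4, 4, 4, 4]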
- cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/ppdiffusers/models/cross_attention.py). - - Returns: - [`~models.unet_2d_condition.UNet3DConditionOutput`] or `tuple`: - [`~models.unet_2d_condition.UNet3DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. - """ - # TODO junnyu, add this to support pure fp16 - sample = sample.cast(self.dtype) - - # By default samples have to be AT least a multiple of the overall upsampling factor. - # The overall upsampling factor is equal to 2 ** (# num of upsampling layears). - # However, the upsampling interpolation output size can be forced to fit any upsampling size - # on the fly if necessary. - default_overall_up_factor = 2**self.num_upsamplers - - # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` - forward_upsample_size = False - upsample_size = None - - if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): - logger.info("Forward upsample size to force interpolation output size.") - forward_upsample_size = True - # prepare attention_mask - if attention_mask is not None: - attention_mask = (1 - attention_mask.cast(sample.dtype)) * -10000.0 - attention_mask = attention_mask.unsqueeze(1) - - # 1. time - timesteps = timestep - if not paddle.is_tensor(timesteps): - if isinstance(timestep, float): - dtype = "float64" - else: - dtype = "int64" - timesteps = paddle.to_tensor([timesteps], dtype=dtype) - elif len(timesteps.shape) == 0: - timesteps = timesteps[None] - num_frames = sample.shape[2] - timesteps = timesteps.expand( - [ - sample.shape[0], - ] - ) - t_emb = self.time_proj(timesteps) - - # timesteps does not contain any weights and will always return f32 tensors - # but time_embedding might actually be running in fp16. so we need to cast here. - # there might be better ways to encapsulate this. - t_emb = t_emb.cast(dtype=self.dtype) - emb = self.time_embedding(t_emb, timestep_cond) - emb = emb.repeat_interleave(repeats=num_frames, axis=0) - encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, axis=0) - sample = sample.transpose([0, 2, 1, 3, 4]).reshape( - (sample.shape[0] * num_frames, -1) + tuple(sample.shape[3:]) - ) - sample = self.conv_in(sample) - sample = self.transformer_in( - sample, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs - ).sample - # 3. 
down - down_block_res_samples = (sample,) - for downsample_block in self.down_blocks: - if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: - sample, res_samples = downsample_block( - hidden_states=sample, - temb=emb, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - num_frames=num_frames, - cross_attention_kwargs=cross_attention_kwargs, - ) - else: - sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames) - down_block_res_samples += res_samples - if down_block_additional_residuals is not None: - new_down_block_res_samples = () - for down_block_res_sample, down_block_additional_residual in zip( - down_block_res_samples, down_block_additional_residuals - ): - down_block_res_sample = down_block_res_sample + down_block_additional_residual - new_down_block_res_samples += (down_block_res_sample,) - down_block_res_samples = new_down_block_res_samples - # 4. mid - if self.mid_block is not None: - sample = self.mid_block( - sample, - emb, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - num_frames=num_frames, - cross_attention_kwargs=cross_attention_kwargs, - ) - if mid_block_additional_residual is not None: - sample = sample + mid_block_additional_residual - # 5. up - for i, upsample_block in enumerate(self.up_blocks): - is_final_block = i == len(self.up_blocks) - 1 - res_samples = down_block_res_samples[-len(upsample_block.resnets) :] - down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] - # if we have not reached the final block and need to forward the - # upsample size, we do it here - if not is_final_block and forward_upsample_size: - upsample_size = down_block_res_samples[-1].shape[2:] - if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: - sample = upsample_block( - hidden_states=sample, - temb=emb, - res_hidden_states_tuple=res_samples, - encoder_hidden_states=encoder_hidden_states, - upsample_size=upsample_size, - attention_mask=attention_mask, - num_frames=num_frames, - cross_attention_kwargs=cross_attention_kwargs, - ) - else: - sample = upsample_block( - hidden_states=sample, - temb=emb, - res_hidden_states_tuple=res_samples, - upsample_size=upsample_size, - num_frames=num_frames, - ) - # 6. post-process - if self.conv_norm_out: - sample = self.conv_norm_out(sample) - sample = self.conv_act(sample) - sample = self.conv_out(sample) - # reshape to (batch, channel, framerate, width, height) - sample = sample[None, :].reshape((-1, num_frames) + tuple(sample.shape[1:])).transpose([0, 2, 1, 3, 4]) - if not return_dict: - return (sample,) - return UNet3DConditionOutput(sample=sample) diff --git a/ppdiffusers/ppdiffusers/models/uvit.py b/ppdiffusers/ppdiffusers/models/uvit.py deleted file mode 100644 index c975b6d70ee7..000000000000 --- a/ppdiffusers/ppdiffusers/models/uvit.py +++ /dev/null @@ -1,386 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
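# A minimal sketch of the frame flattening performed in UNet3DConditionModel.forward above,
# assuming paddle is available; the shapes are illustrative. Each video frame is folded into
# the batch dimension so the 2D convolutions and spatial attention blocks can process it,
# and the output is folded back to (batch, channels, frames, height, width).
import paddle

batch, channels, num_frames, height, width = 2, 4, 8, 32, 32
sample = paddle.randn([batch, channels, num_frames, height, width])

# (B, C, F, H, W) -> (B, F, C, H, W) -> (B * F, C, H, W), as done before conv_in
flat = sample.transpose([0, 2, 1, 3, 4]).reshape([batch * num_frames, channels, height, width])

# ... the 2D down/mid/up blocks run on `flat`, with temporal attention mixing frames ...

# back to (B, C, F, H, W), as done after conv_out
restored = flat.reshape([batch, num_frames, channels, height, width]).transpose([0, 2, 1, 3, 4])
assert restored.shape == sample.shape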
-# See the License for the specific language governing permissions and -# limitations under the License. -from dataclasses import dataclass -from typing import Optional - -import einops -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, is_ppxformers_available -from .attention import DropPath, Mlp -from .embeddings import PatchEmbed, get_timestep_embedding -from .modeling_utils import ModelMixin - - -def unpatchify(x, in_chans): - patch_size = int((x.shape[2] // in_chans) ** 0.5) - h = w = int(x.shape[1] ** 0.5) - assert h * w == x.shape[1] and patch_size ** 2 * in_chans == x.shape[2] - x = einops.rearrange(x, "B (h w) (p1 p2 C) -> B C (h p1) (w p2)", h=h, p1=patch_size, p2=patch_size) - return x - - -def interpolate_pos_emb(pos_emb, old_shape, new_shape): - pos_emb = einops.rearrange(pos_emb, "B (H W) C -> B C H W", H=old_shape[0], W=old_shape[1]) - pos_emb = F.interpolate(pos_emb, new_shape, mode="bilinear") - pos_emb = einops.rearrange(pos_emb, "B C H W -> B (H W) C") - return pos_emb - - -class Attention(nn.Layer): - def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.0, proj_drop=0.0): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - self.head_size = head_dim - self.scale = qk_scale or head_dim**-0.5 - self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) - self.attn_drop = attn_drop - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - self._use_memory_efficient_attention_xformers = False - self._attention_op = None - - def reshape_heads_to_batch_dim(self, tensor, transpose=True): - tensor = tensor.reshape([0, 0, self.num_heads, self.head_size]) - if transpose: - tensor = tensor.transpose([0, 2, 1, 3]) - return tensor - - def reshape_batch_dim_to_heads(self, tensor, transpose=True): - if transpose: - tensor = tensor.transpose([0, 2, 1, 3]) - tensor = tensor.reshape([0, 0, tensor.shape[2] * tensor.shape[3]]) - return tensor - - def set_use_memory_efficient_attention_xformers( - self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[str] = None - ): - if self.head_size > 128 and attention_op == "flash": - attention_op = "cutlass" - if use_memory_efficient_attention_xformers: - if not is_ppxformers_available(): - raise NotImplementedError( - "requires the scaled_dot_product_attention but your PaddlePaddle donot have this. Checkout the instructions on the installation page: https://www.paddlepaddle.org.cn/install/quick and follow the ones that match your environment." 
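# A quick shape check for the unpatchify helper defined above (uvit.py): it maps a sequence of
# h*w patch tokens of size p*p*C back to a (B, C, h*p, w*p) image. The sizes below are
# illustrative; einops with the paddle backend is assumed, as in the module itself.
import einops
import paddle

B, C, p, h = 2, 4, 2, 32                       # 32x32 grid of 2x2 patches over 4 channels
tokens = paddle.randn([B, h * h, p * p * C])   # (2, 1024, 16)
img = einops.rearrange(tokens, "B (h w) (p1 p2 C) -> B C (h p1) (w p2)", h=h, p1=p, p2=p)
print(img.shape)  # [2, 4, 64, 64]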
- ) - else: - try: - _ = F.scaled_dot_product_attention_( - paddle.ones((1, 1, 2, 40), dtype=paddle.float16), - paddle.ones((1, 1, 2, 40), dtype=paddle.float16), - paddle.ones((1, 1, 2, 40), dtype=paddle.float16), - attention_op=attention_op, - ) - except Exception as e: - raise e - - self._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers - self._attention_op = attention_op - - def forward(self, x): - qkv = self.qkv(x) - if not self._use_memory_efficient_attention_xformers: - qkv = qkv.cast(paddle.float32) - query_proj, key_proj, value_proj = qkv.chunk(3, axis=-1) - query_proj = self.reshape_heads_to_batch_dim( - query_proj, transpose=not self._use_memory_efficient_attention_xformers - ) - key_proj = self.reshape_heads_to_batch_dim( - key_proj, transpose=not self._use_memory_efficient_attention_xformers - ) - value_proj = self.reshape_heads_to_batch_dim( - value_proj, transpose=not self._use_memory_efficient_attention_xformers - ) - - if self._use_memory_efficient_attention_xformers: - hidden_states = F.scaled_dot_product_attention_( - query_proj, - key_proj, - value_proj, - attn_mask=None, - scale=self.scale, - dropout_p=self.attn_drop, - training=self.training, - attention_op=self._attention_op, - ) - else: - with paddle.amp.auto_cast(enable=False): - attention_scores = paddle.matmul(query_proj * self.scale, key_proj, transpose_y=True) - attention_probs = F.softmax(attention_scores, axis=-1) - hidden_states = paddle.matmul(attention_probs, value_proj).cast(x.dtype) - - hidden_states = self.reshape_batch_dim_to_heads( - hidden_states, transpose=not self._use_memory_efficient_attention_xformers - ) - - hidden_states = self.proj_drop(self.proj(hidden_states)) - return hidden_states - - -class Block(nn.Layer): - def __init__( - self, - dim, - num_heads, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - drop=0.0, - attn_drop=0.0, - drop_path=0.0, - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - skip=False, - ): - super().__init__() - self.norm1 = norm_layer(dim) if skip else None - self.norm2 = norm_layer(dim) - - self.attn = Attention( - dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop - ) - self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.norm3 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) - self.skip_linear = nn.Linear(2 * dim, dim) if skip else None - - def forward(self, x, skip=None): - if self.skip_linear is not None: - x = self.skip_linear(paddle.concat([x, skip], axis=-1)) - x = self.norm1(x) - x = x + self.drop_path(self.attn(x)) - x = self.norm2(x) - - x = x + self.drop_path(self.mlp(x)) - x = self.norm3(x) - - return x - - -@dataclass -class UViTModelOutput(BaseOutput): - """ - Args: - sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): - Hidden states output. Output of last layer of model. - """ - - sample_img: paddle.Tensor - sample_clip_img: paddle.Tensor - sample_text: paddle.Tensor - - -class UViTModel(ModelMixin, ConfigMixin): - r""" - UViTModel is a unet-stype ViT model that takes in a noisy sample and a timestep and returns sample shaped output. - Note that the different between the original U-ViT is the post-layer normalization and add a layer normalization - after concatenat-ing a long skip connection, which stabilizes the training of U-ViT in UniDiffuser. 
- - """ - - @register_to_config - def __init__( - self, - sample_size=1, - img_size=64, - in_channels=4, - patch_size=2, - embed_dim=1536, - depth=30, - num_heads=24, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - pos_drop_rate=0.0, - drop_rate=0.0, - attn_drop_rate=0.0, - norm_type="layer_norm", - text_dim=64, - num_text_tokens=77, - clip_img_dim=512, - use_checkpoint=False, - ): - super().__init__() - self.sample_size = sample_size - self.in_channels = in_channels - self.patch_size = patch_size - self.embed_dim = embed_dim - - self.img_size = (img_size, img_size) if isinstance(img_size, int) else img_size - self.patch_embed = PatchEmbed( - height=self.img_size[0], - width=self.img_size[1], - patch_size=patch_size, - in_channels=in_channels, - embed_dim=embed_dim, - add_pos_embed=False, - ) - assert self.img_size[0] % patch_size == 0 and self.img_size[1] % patch_size == 0 - self.num_patches = (self.img_size[0] // patch_size) * (self.img_size[1] // patch_size) - - self.encode_prefix = nn.Linear(768, text_dim) - - self.text_embed = nn.Linear(text_dim, embed_dim) - self.text_out = nn.Linear(embed_dim, text_dim) - self.clip_img_embed = nn.Linear(clip_img_dim, embed_dim) - self.clip_img_out = nn.Linear(embed_dim, clip_img_dim) - - self.num_text_tokens = num_text_tokens - self.num_tokens = 1 + 1 + num_text_tokens + 1 + self.num_patches - - self.pos_embed = self.create_parameter( - shape=(1, self.num_tokens, embed_dim), - default_initializer=nn.initializer.Constant(0.0), - ) - assert norm_type == "layer_norm", "We only support norm_type == layer_norm. " - norm_layer = nn.LayerNorm - self.pos_drop = nn.Dropout(p=pos_drop_rate) - - self.in_blocks = nn.LayerList( - [ - Block( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - norm_layer=norm_layer, - ) - for _ in range(depth // 2) - ] - ) - - self.mid_block = Block( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - norm_layer=norm_layer, - ) - - self.out_blocks = nn.LayerList( - [ - Block( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - norm_layer=norm_layer, - skip=True, - ) - for _ in range(depth // 2) - ] - ) - - self.norm = norm_layer(embed_dim) - self.patch_dim = patch_size**2 * in_channels - self.decoder_pred = nn.Linear(embed_dim, self.patch_dim, bias_attr=True) - - self.token_embedding = nn.Embedding(2, embed_dim) - self.pos_embed_token = self.create_parameter( - shape=(1, 1, embed_dim), default_initializer=nn.initializer.Constant(0.0) - ) - - def forward( - self, - img: paddle.Tensor, - clip_img: paddle.Tensor, - text: paddle.Tensor, - t_img: paddle.Tensor, - t_text: paddle.Tensor, - data_type: paddle.Tensor, - return_dict=False, # TODO: nf - ): - _, _, H, W = img.shape - # TODO junnyu, support float16 - img = img.cast(self.dtype) - clip_img = clip_img.cast(self.dtype) - text = text.cast(self.dtype) - - img = self.patch_embed(img) - clip_img = self.clip_img_embed(clip_img) - text = self.text_embed(text) - - t_img_token = get_timestep_embedding(t_img, self.embed_dim, True, 0).unsqueeze(axis=1) - t_text_token = get_timestep_embedding(t_text, self.embed_dim, True, 0).unsqueeze(axis=1) - token_embed = self.token_embedding(data_type).unsqueeze(axis=1) - - # TODO junnyu, support float16 - t_img_token = t_img_token.cast(self.dtype) - 
t_text_token = t_text_token.cast(self.dtype) - token_embed = token_embed.cast(self.dtype) - - x = paddle.concat((t_img_token, t_text_token, token_embed, text, clip_img, img), axis=1) - - num_text_tokens, num_img_tokens = text.shape[1], img.shape[1] - - pos_embed = paddle.concat( - [self.pos_embed[:, : 1 + 1, :], self.pos_embed_token, self.pos_embed[:, 1 + 1 :, :]], axis=1 - ) - - if H == self.img_size[0] and W == self.img_size[1]: - pass - else: - # interpolate the positional embedding when the input image is not of the default shape - pos_embed_others, pos_embed_patches = paddle.split( - pos_embed, [1 + 1 + 1 + num_text_tokens + 1, self.num_patches], axis=1 - ) - pos_embed_patches = interpolate_pos_emb( - pos_embed_patches, - (self.img_size[0] // self.patch_size, self.img_size[1] // self.patch_size), - (H // self.patch_size, W // self.patch_size), - ) - pos_embed = paddle.concat((pos_embed_others, pos_embed_patches), axis=1) - - x = x + pos_embed - x = self.pos_drop(x) - - skips = [] - for blk in self.in_blocks: - x = blk(x) - skips.append(x) - - x = self.mid_block(x) - - for blk in self.out_blocks: - x = blk(x, skips.pop()) - - x = self.norm(x) - - t_img_token_out, t_text_token_out, token_embed_out, text_out, clip_img_out, img_out = x.split( - (1, 1, 1, num_text_tokens, 1, num_img_tokens), axis=1 - ) - - img_out = self.decoder_pred(img_out) - sample_img = unpatchify(img_out, self.in_channels) - sample_clip_img = self.clip_img_out(clip_img_out) - sample_text = self.text_out(text_out) - - if not return_dict: - return (sample_img, sample_clip_img, sample_text) - - return UViTModelOutput(sample_img=sample_img, sample_clip_img=sample_clip_img, sample_text=sample_text) diff --git a/ppdiffusers/ppdiffusers/models/vae.py b/ppdiffusers/ppdiffusers/models/vae.py deleted file mode 100644 index 32d063a02307..000000000000 --- a/ppdiffusers/ppdiffusers/models/vae.py +++ /dev/null @@ -1,424 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from dataclasses import dataclass -from typing import Optional - -import numpy as np -import paddle -import paddle.nn as nn -from paddle.distributed.fleet.utils import recompute - -from ..utils import BaseOutput, randn_tensor -from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block - -try: - from paddle.amp.auto_cast import amp_state -except ImportError: - from paddle.fluid.dygraph.amp.auto_cast import amp_state - - -def finfo(dtype): - if dtype == paddle.float32: - return np.finfo(np.float32) - if dtype == paddle.float16: - return np.finfo(np.float16) - if dtype == paddle.float64: - return np.finfo(np.float64) - - -@dataclass -class DecoderOutput(BaseOutput): - """ - Output of decoding method. - - Args: - sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): - Decoded output sample of the model. Output of the last layer of the model. 
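# A small arithmetic sketch of the token layout UViTModel.forward above concatenates along
# the sequence axis, using the module's default configuration (img_size=64, patch_size=2,
# num_text_tokens=77). The data-type token gets the separate pos_embed_token parameter, so
# the full sequence is one longer than the num_tokens used to size pos_embed.
img_size, patch_size, num_text_tokens = 64, 2, 77
num_patches = (img_size // patch_size) ** 2     # 1024 image patch tokens

# concat order: t_img token, t_text token, data-type token, text tokens, CLIP image token, patches
seq_len = 1 + 1 + 1 + num_text_tokens + 1 + num_patches
num_tokens = 1 + 1 + num_text_tokens + 1 + num_patches   # size of pos_embed
print(seq_len, num_tokens)  # 1105 1104
assert seq_len == num_tokens + 1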
- """ - - sample: paddle.Tensor - - -class Encoder(nn.Layer): - def __init__( - self, - in_channels=3, - out_channels=3, - down_block_types=("DownEncoderBlock2D",), - block_out_channels=(64,), - layers_per_block=2, - norm_num_groups=32, - act_fn="silu", - double_z=True, - ): - super().__init__() - self.layers_per_block = layers_per_block - - self.conv_in = nn.Conv2D(in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1) - - self.mid_block = None - self.down_blocks = nn.LayerList([]) - - # down - output_channel = block_out_channels[0] - for i, down_block_type in enumerate(down_block_types): - input_channel = output_channel - output_channel = block_out_channels[i] - is_final_block = i == len(block_out_channels) - 1 - - down_block = get_down_block( - down_block_type, - num_layers=self.layers_per_block, - in_channels=input_channel, - out_channels=output_channel, - add_downsample=not is_final_block, - resnet_eps=1e-6, - downsample_padding=0, - resnet_act_fn=act_fn, - resnet_groups=norm_num_groups, - attn_num_head_channels=None, - temb_channels=None, - ) - self.down_blocks.append(down_block) - - # mid - self.mid_block = UNetMidBlock2D( - in_channels=block_out_channels[-1], - resnet_eps=1e-6, - resnet_act_fn=act_fn, - output_scale_factor=1, - resnet_time_scale_shift="default", - attn_num_head_channels=None, - resnet_groups=norm_num_groups, - temb_channels=None, - ) - - # out - self.conv_norm_out = nn.GroupNorm( - num_channels=block_out_channels[-1], num_groups=norm_num_groups, epsilon=1e-6 - ) - self.conv_act = nn.Silu() - - conv_out_channels = 2 * out_channels if double_z else out_channels - self.conv_out = nn.Conv2D(block_out_channels[-1], conv_out_channels, 3, padding=1) - self.gradient_checkpointing = False - - def forward(self, x): - sample = x - sample = self.conv_in(sample) - - if self.training and self.gradient_checkpointing and not sample.stop_gradient: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - # down - for down_block in self.down_blocks: - sample = recompute(create_custom_forward(down_block), sample) - - # middle - sample = recompute(create_custom_forward(self.mid_block), sample) - - else: - # down - for down_block in self.down_blocks: - sample = down_block(sample) - - # middle - sample = self.mid_block(sample) - - # post-process - sample = self.conv_norm_out(sample) - sample = self.conv_act(sample) - sample = self.conv_out(sample) - - return sample - - -class Decoder(nn.Layer): - def __init__( - self, - in_channels=3, - out_channels=3, - up_block_types=("UpDecoderBlock2D",), - block_out_channels=(64,), - layers_per_block=2, - norm_num_groups=32, - act_fn="silu", - ): - super().__init__() - self.layers_per_block = layers_per_block - - self.conv_in = nn.Conv2D(in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1) - - self.mid_block = None - self.up_blocks = nn.LayerList([]) - - # mid - self.mid_block = UNetMidBlock2D( - in_channels=block_out_channels[-1], - resnet_eps=1e-6, - resnet_act_fn=act_fn, - output_scale_factor=1, - resnet_time_scale_shift="default", - attn_num_head_channels=None, - resnet_groups=norm_num_groups, - temb_channels=None, - ) - - # up - reversed_block_out_channels = list(reversed(block_out_channels)) - output_channel = reversed_block_out_channels[0] - for i, up_block_type in enumerate(up_block_types): - prev_output_channel = output_channel - output_channel = reversed_block_out_channels[i] - - is_final_block = i == len(block_out_channels) - 1 - - 
up_block = get_up_block( - up_block_type, - num_layers=self.layers_per_block + 1, - in_channels=prev_output_channel, - out_channels=output_channel, - prev_output_channel=None, - add_upsample=not is_final_block, - resnet_eps=1e-6, - resnet_act_fn=act_fn, - resnet_groups=norm_num_groups, - attn_num_head_channels=None, - temb_channels=None, - ) - self.up_blocks.append(up_block) - prev_output_channel = output_channel - - # out - self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, epsilon=1e-6) - self.conv_act = nn.Silu() - self.conv_out = nn.Conv2D(block_out_channels[0], out_channels, 3, padding=1) - self.gradient_checkpointing = False - - def forward(self, z): - sample = z - sample = self.conv_in(sample) - - upscale_dtype = self.dtype - if self.training and self.gradient_checkpointing and not sample.stop_gradient: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - # middle - sample = recompute(create_custom_forward(self.mid_block), sample) - if upscale_dtype != sample.dtype: - sample = sample.cast(upscale_dtype) - - # up - for up_block in self.up_blocks: - sample = recompute(create_custom_forward(up_block), sample) - else: - # middle - sample = self.mid_block(sample) - if upscale_dtype != sample.dtype: - sample = sample.cast(upscale_dtype) - - # up - for up_block in self.up_blocks: - sample = up_block(sample) - - # (TODO, junnyu) check nan - # clamp inf values to enable fp16 training - if (amp_state() or sample.dtype == paddle.float16) and paddle.isinf(sample).any(): - clamp_value = finfo(sample.dtype).max - 1000 - sample = paddle.clip(sample, min=-clamp_value, max=clamp_value) - - # post-process - sample = self.conv_norm_out(sample) - sample = self.conv_act(sample) - sample = self.conv_out(sample) - - return sample - - -class VectorQuantizer(nn.Layer): - """ - Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly avoids costly matrix - multiplications and allows for post-hoc remapping of indices. - """ - - # NOTE: due to a bug the beta term was applied to the wrong term. for - # backwards compatibility we use the buggy version by default, but you can - # specify legacy=False to fix it. - def __init__( - self, n_e, vq_embed_dim, beta, remap=None, unknown_index="random", sane_index_shape=False, legacy=True - ): - super().__init__() - self.n_e = n_e - self.vq_embed_dim = vq_embed_dim - self.beta = beta - self.legacy = legacy - - self.embedding = nn.Embedding( - self.n_e, self.vq_embed_dim, weight_attr=nn.initializer.Uniform(-1.0 / self.n_e, 1.0 / self.n_e) - ) - - self.remap = remap - if self.remap is not None: - self.register_buffer("used", paddle.to_tensor(np.load(self.remap))) - self.re_embed = self.used.shape[0] - self.unknown_index = unknown_index # "random" or "extra" or integer - if self.unknown_index == "extra": - self.unknown_index = self.re_embed - self.re_embed = self.re_embed + 1 - print( - f"Remapping {self.n_e} indices to {self.re_embed} indices. " - f"Using {self.unknown_index} for unknown indices." 
- ) - else: - self.re_embed = n_e - - self.sane_index_shape = sane_index_shape - - def remap_to_used(self, inds): - ishape = inds.shape - assert len(ishape) > 1 - inds = inds.reshape([ishape[0], -1]) - used = self.used.cast(inds.dtype) - match = (inds[:, :, None] == used[None, None, ...]).cast("int64") - new = match.argmax(-1) - unknown = match.sum(2) < 1 - if self.unknown_index == "random": - new[unknown] = paddle.randint(0, self.re_embed, shape=new[unknown].shape) - else: - new[unknown] = self.unknown_index - return new.reshape(ishape) - - def unmap_to_all(self, inds): - ishape = inds.shape - assert len(ishape) > 1 - inds = inds.reshape([ishape[0], -1]) - used = self.used.cast(inds.dtype) - if self.re_embed > self.used.shape[0]: # extra token - inds[inds >= self.used.shape[0]] = 0 # simply set to zero - back = paddle.take_along_axis(used[None, :][inds.shape[0] * [0], :], inds, axis=1) - return back.reshape(ishape) - - def forward(self, z): - # reshape z -> (batch, height, width, channel) and flatten - z = z.transpose([0, 2, 3, 1]) - z_flattened = z.reshape([-1, self.vq_embed_dim]) - # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z - - d = ( - paddle.sum(z_flattened**2, axis=1, keepdim=True) - + paddle.sum(self.embedding.weight**2, axis=1) - - 2 * paddle.matmul(z_flattened, self.embedding.weight, transpose_y=True) - ) - - min_encoding_indices = paddle.argmin(d, axis=1) - z_q = self.embedding(min_encoding_indices).reshape(z.shape) - perplexity = None - min_encodings = None - - # compute loss for embedding - if not self.legacy: - loss = self.beta * paddle.mean((z_q.detach() - z) ** 2) + paddle.mean((z_q - z.detach()) ** 2) - else: - loss = paddle.mean((z_q.detach() - z) ** 2) + self.beta * paddle.mean((z_q - z.detach()) ** 2) - - # preserve gradients - z_q = z + (z_q - z).detach() - - # reshape back to match original input shape - z_q = z_q.transpose([0, 3, 1, 2]) - - if self.remap is not None: - min_encoding_indices = min_encoding_indices.reshape([z.shape[0], -1]) # add batch axis - min_encoding_indices = self.remap_to_used(min_encoding_indices) - min_encoding_indices = min_encoding_indices.reshape([-1, 1]) # flatten - - if self.sane_index_shape: - min_encoding_indices = min_encoding_indices.reshape([z_q.shape[0], z_q.shape[2], z_q.shape[3]]) - - return z_q, loss, (perplexity, min_encodings, min_encoding_indices) - - def get_codebook_entry(self, indices, shape): - # shape specifying (batch, height, width, channel) - if self.remap is not None: - indices = indices.reshape([shape[0], -1]) # add batch axis - indices = self.unmap_to_all(indices) - indices = indices.reshape( - [ - -1, - ] - ) # flatten again - - # get quantized latent vectors - z_q = self.embedding(indices) - - if shape is not None: - z_q = z_q.reshape(shape) - # reshape back to match original input shape - z_q = z_q.transpose([0, 3, 1, 2]) - - return z_q - - -class DiagonalGaussianDistribution(object): - def __init__(self, parameters, deterministic=False): - self.parameters = parameters - self.mean, self.logvar = paddle.chunk(parameters, 2, axis=1) - self.logvar = paddle.clip(self.logvar, -30.0, 20.0) - self.deterministic = deterministic - self.std = paddle.exp(0.5 * self.logvar) - self.var = paddle.exp(self.logvar) - if self.deterministic: - self.var = self.std = paddle.zeros_like(self.mean, dtype=self.parameters.dtype) - - def sample(self, generator: Optional[paddle.Generator] = None) -> paddle.Tensor: - # make sure sample is on the same device as the parameters and has same dtype - sample = 
randn_tensor(self.mean.shape, generator=generator, dtype=self.parameters.dtype) - x = self.mean + self.std * sample - return x - - def kl(self, other=None): - if self.deterministic: - return paddle.to_tensor([0.0]) - else: - if other is None: - return 0.5 * paddle.sum(paddle.pow(self.mean, 2) + self.var - 1.0 - self.logvar, axis=[1, 2, 3]) - else: - return 0.5 * paddle.sum( - paddle.pow(self.mean - other.mean, 2) / other.var - + self.var / other.var - - 1.0 - - self.logvar - + other.logvar, - axis=[1, 2, 3], - ) - - def nll(self, sample, axis=[1, 2, 3]): - if self.deterministic: - return paddle.to_tensor([0.0]) - logtwopi = np.log(2.0 * np.pi) - return 0.5 * paddle.sum(logtwopi + self.logvar + paddle.pow(sample - self.mean, 2) / self.var, axis=axis) - - def mode(self): - return self.mean diff --git a/ppdiffusers/ppdiffusers/models/vq_model.py b/ppdiffusers/ppdiffusers/models/vq_model.py deleted file mode 100644 index 0e55868481fa..000000000000 --- a/ppdiffusers/ppdiffusers/models/vq_model.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from dataclasses import dataclass -from typing import Optional, Tuple - -import paddle -import paddle.nn as nn - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput -from .modeling_utils import ModelMixin -from .vae import Decoder, DecoderOutput, Encoder, VectorQuantizer - - -@dataclass -class VQEncoderOutput(BaseOutput): - """ - Output of VQModel encoding method. - - Args: - latents (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): - Encoded output sample of the model. Output of the last layer of the model. - """ - - latents: paddle.Tensor - - -class VQModel(ModelMixin, ConfigMixin): - r"""VQ-VAE model from the paper Neural Discrete Representation Learning by Aaron van den Oord, Oriol Vinyals and Koray - Kavukcuoglu. - - This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the model (such as downloading or saving, etc.) - - Parameters: - in_channels (int, *optional*, defaults to 3): Number of channels in the input image. - out_channels (int, *optional*, defaults to 3): Number of channels in the output. - down_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("DownEncoderBlock2D",)`): Tuple of downsample block types. - up_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("UpDecoderBlock2D",)`): Tuple of upsample block types. - block_out_channels (`Tuple[int]`, *optional*, defaults to : - obj:`(64,)`): Tuple of block output channels. - act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. - latent_channels (`int`, *optional*, defaults to `3`): Number of channels in the latent space. 
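# A quick numerical check of the closed-form KL term used by DiagonalGaussianDistribution.kl
# above (vae.py): with zero mean and zero log-variance the posterior equals the N(0, I) prior,
# so the divergence is exactly zero; shifting the mean adds 0.5 * sum(mean^2). Shapes are
# illustrative and paddle is assumed to be installed.
import paddle

mean = paddle.zeros([1, 4, 8, 8])
logvar = paddle.zeros([1, 4, 8, 8])
var = paddle.exp(logvar)

kl = 0.5 * paddle.sum(mean**2 + var - 1.0 - logvar, axis=[1, 2, 3])
print(float(kl))  # 0.0

shifted_mean = paddle.ones([1, 4, 8, 8])
kl_shifted = 0.5 * paddle.sum(shifted_mean**2 + var - 1.0 - logvar, axis=[1, 2, 3])
print(float(kl_shifted))  # 128.0 == 0.5 * 4 * 8 * 8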
- sample_size (`int`, *optional*, defaults to `32`): TODO - num_vq_embeddings (`int`, *optional*, defaults to `256`): Number of codebook vectors in the VQ-VAE. - vq_embed_dim (`int`, *optional*): Hidden dim of codebook vectors in the VQ-VAE. - scaling_factor (`float`, *optional*, defaults to `0.18215`): - The component-wise standard deviation of the trained latent space computed using the first batch of the - training set. This is used to scale the latent space to have unit variance when training the diffusion - model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the - diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1 - / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image - Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. - """ - - @register_to_config - def __init__( - self, - in_channels: int = 3, - out_channels: int = 3, - down_block_types: Tuple[str] = ("DownEncoderBlock2D",), - up_block_types: Tuple[str] = ("UpDecoderBlock2D",), - block_out_channels: Tuple[int] = (64,), - layers_per_block: int = 1, - act_fn: str = "silu", - latent_channels: int = 3, - sample_size: int = 32, - num_vq_embeddings: int = 256, - norm_num_groups: int = 32, - vq_embed_dim: Optional[int] = None, - scaling_factor: float = 0.18215, - ): - super().__init__() - - # pass init params to Encoder - self.encoder = Encoder( - in_channels=in_channels, - out_channels=latent_channels, - down_block_types=down_block_types, - block_out_channels=block_out_channels, - layers_per_block=layers_per_block, - act_fn=act_fn, - norm_num_groups=norm_num_groups, - double_z=False, - ) - - vq_embed_dim = vq_embed_dim if vq_embed_dim is not None else latent_channels - - self.quant_conv = nn.Conv2D(latent_channels, vq_embed_dim, 1) - self.quantize = VectorQuantizer(num_vq_embeddings, vq_embed_dim, beta=0.25, remap=None, sane_index_shape=False) - self.post_quant_conv = nn.Conv2D(vq_embed_dim, latent_channels, 1) - - # pass init params to Decoder - self.decoder = Decoder( - in_channels=latent_channels, - out_channels=out_channels, - up_block_types=up_block_types, - block_out_channels=block_out_channels, - layers_per_block=layers_per_block, - act_fn=act_fn, - norm_num_groups=norm_num_groups, - ) - - def encode(self, x: paddle.Tensor, return_dict: bool = True): - h = self.encoder(x) - h = self.quant_conv(h) - - if not return_dict: - return (h,) - - return VQEncoderOutput(latents=h) - - def decode(self, h: paddle.Tensor, force_not_quantize: bool = False, return_dict: bool = True): - # cast h to float16 / float32 - h = h.cast(self.dtype) - # also go through quantization layer - if not force_not_quantize: - quant, emb_loss, info = self.quantize(h) - else: - quant = h - quant = self.post_quant_conv(quant) - dec = self.decoder(quant) - - if not return_dict: - return (dec,) - - return DecoderOutput(sample=dec) - - def forward(self, sample: paddle.Tensor, return_dict: bool = True): - r""" - Args: - sample (`paddle.Tensor`): Input sample. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`DecoderOutput`] instead of a plain tuple. 
- """ - x = sample - h = self.encode(x).latents - dec = self.decode(h).sample - - if not return_dict: - return (dec,) - - return DecoderOutput(sample=dec) diff --git a/ppdiffusers/ppdiffusers/optimization.py b/ppdiffusers/ppdiffusers/optimization.py deleted file mode 100644 index c15dbd6b3e8b..000000000000 --- a/ppdiffusers/ppdiffusers/optimization.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Paddle optimization for diffusion models.""" - -import math -from enum import Enum -from typing import Optional, Union - -from paddle.optimizer.lr import LambdaDecay - -from .utils import logging - -logger = logging.get_logger(__name__) - - -class SchedulerType(Enum): - LINEAR = "linear" - COSINE = "cosine" - COSINE_WITH_RESTARTS = "cosine_with_restarts" - POLYNOMIAL = "polynomial" - CONSTANT = "constant" - CONSTANT_WITH_WARMUP = "constant_with_warmup" - - -def get_constant_schedule(learning_rate: float, last_epoch: int = -1): - """ - Create a schedule with a constant learning rate, using the learning rate set in optimizer. - - Args: - learning_rate (`float`): - The base learning rate. It is a python float number. - last_epoch (`int`, *optional*, defaults to -1): - The index of the last epoch when resuming training. - - Return: - `paddle.optimizer.lr.LambdaDecay` with the appropriate schedule. - """ - return LambdaDecay(learning_rate, lambda _: 1, last_epoch=last_epoch) - - -def get_constant_schedule_with_warmup(learning_rate: float, num_warmup_steps: int, last_epoch: int = -1): - """ - Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate - increases linearly between 0 and the initial lr set in the optimizer. - - Args: - learning_rate (`float`): - The base learning rate. It is a python float number. - num_warmup_steps (`int`): - The number of steps for the warmup phase. - last_epoch (`int`, *optional*, defaults to -1): - The index of the last epoch when resuming training. - - Return: - `paddle.optimizer.lr.LambdaDecay` with the appropriate schedule. - """ - - def lr_lambda(current_step: int): - if current_step < num_warmup_steps: - return float(current_step) / float(max(1.0, num_warmup_steps)) - return 1.0 - - return LambdaDecay(learning_rate, lr_lambda, last_epoch=last_epoch) - - -def get_linear_schedule_with_warmup( - learning_rate: float, num_warmup_steps: int, num_training_steps: int, last_epoch: int = -1 -): - """ - Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after - a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer. - - Args: - learning_rate (`float`): - The base learning rate. It is a python float number. - num_warmup_steps (`int`): - The number of steps for the warmup phase. - num_training_steps (`int`): - The total number of training steps. 
- last_epoch (`int`, *optional*, defaults to -1): - The index of the last epoch when resuming training. - - Return: - `paddle.optimizer.lr.LambdaDecay` with the appropriate schedule. - """ - - def lr_lambda(current_step: int): - if current_step < num_warmup_steps: - return float(current_step) / float(max(1, num_warmup_steps)) - return max( - 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)) - ) - - return LambdaDecay(learning_rate, lr_lambda, last_epoch) - - -def get_cosine_schedule_with_warmup( - learning_rate: float, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, last_epoch: int = -1 -): - """ - Create a schedule with a learning rate that decreases following the values of the cosine function between the - initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the - initial lr set in the optimizer. - - Args: - learning_rate (`float`): - The base learning rate. It is a python float number. - num_warmup_steps (`int`): - The number of steps for the warmup phase. - num_training_steps (`int`): - The total number of training steps. - num_cycles (`float`, *optional*, defaults to 0.5): - The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 - following a half-cosine). - last_epoch (`int`, *optional*, defaults to -1): - The index of the last epoch when resuming training. - - Return: - `paddle.optimizer.lr.LambdaDecay` with the appropriate schedule. - """ - - def lr_lambda(current_step): - if current_step < num_warmup_steps: - return float(current_step) / float(max(1, num_warmup_steps)) - progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) - return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) - - return LambdaDecay(learning_rate, lr_lambda, last_epoch) - - -def get_cosine_with_hard_restarts_schedule_with_warmup( - learning_rate: float, num_warmup_steps: int, num_training_steps: int, num_cycles: int = 1, last_epoch: int = -1 -): - """ - Create a schedule with a learning rate that decreases following the values of the cosine function between the - initial lr set in the optimizer to 0, with several hard restarts, after a warmup period during which it increases - linearly between 0 and the initial lr set in the optimizer. - - Args: - learning_rate (`float`): - The base learning rate. It is a python float number. - num_warmup_steps (`int`): - The number of steps for the warmup phase. - num_training_steps (`int`): - The total number of training steps. - num_cycles (`int`, *optional*, defaults to 1): - The number of hard restarts to use. - last_epoch (`int`, *optional*, defaults to -1): - The index of the last epoch when resuming training. - - Return: - `paddle.optimizer.lr.LambdaDecay` with the appropriate schedule. 
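# A small sanity check of the lr multiplier produced by get_cosine_schedule_with_warmup above:
# linear warmup to 1.0, then a half-cosine decay to 0 (num_cycles = 0.5). The step counts are
# illustrative; the function below mirrors the lr_lambda defined in that scheduler.
import math

num_warmup_steps, num_training_steps, num_cycles = 100, 1000, 0.5

def lr_lambda(step):
    if step < num_warmup_steps:
        return step / max(1, num_warmup_steps)
    progress = (step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps)
    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * num_cycles * 2.0 * progress)))

print(lr_lambda(0))     # 0.0, start of warmup
print(lr_lambda(50))    # 0.5, halfway through warmup
print(lr_lambda(100))   # 1.0, warmup finished
print(lr_lambda(550))   # 0.5, halfway through the cosine decay
print(lr_lambda(1000))  # ~0.0, end of training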
- """ - - def lr_lambda(current_step): - if current_step < num_warmup_steps: - return float(current_step) / float(max(1, num_warmup_steps)) - progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) - if progress >= 1.0: - return 0.0 - return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0)))) - - return LambdaDecay(learning_rate, lr_lambda, last_epoch) - - -def get_polynomial_decay_schedule_with_warmup( - learning_rate: float, - num_warmup_steps: int, - num_training_steps: int, - lr_end: float = 1e-7, - power: float = 1.0, - last_epoch: int = -1, -): - """ - Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the - optimizer to end lr defined by *lr_end*, after a warmup period during which it increases linearly from 0 to the - initial lr set in the optimizer. - - Args: - learning_rate (`float`): - The base learning rate. It is a python float number. - num_warmup_steps (`int`): - The number of steps for the warmup phase. - num_training_steps (`int`): - The total number of training steps. - lr_end (`float`, *optional*, defaults to 1e-7): - The end LR. - power (`float`, *optional*, defaults to 1.0): - Power factor. - last_epoch (`int`, *optional*, defaults to -1): - The index of the last epoch when resuming training. - - Note: *power* defaults to 1.0 as in the fairseq implementation, which in turn is based on the original BERT - implementation at - https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37 - - Return: - `paddle.optimizer.lr.LambdaDecay` with the appropriate schedule. - - """ - - lr_init = learning_rate - if not (lr_init > lr_end): - raise ValueError(f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})") - - def lr_lambda(current_step: int): - if current_step < num_warmup_steps: - return float(current_step) / float(max(1, num_warmup_steps)) - elif current_step > num_training_steps: - return lr_end / lr_init # as LambdaLR multiplies by lr_init - else: - lr_range = lr_init - lr_end - decay_steps = num_training_steps - num_warmup_steps - pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps - decay = lr_range * pct_remaining**power + lr_end - return decay / lr_init # as LambdaLR multiplies by lr_init - - return LambdaDecay(learning_rate, lr_lambda, last_epoch) - - -TYPE_TO_SCHEDULER_FUNCTION = { - SchedulerType.LINEAR: get_linear_schedule_with_warmup, - SchedulerType.COSINE: get_cosine_schedule_with_warmup, - SchedulerType.COSINE_WITH_RESTARTS: get_cosine_with_hard_restarts_schedule_with_warmup, - SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup, - SchedulerType.CONSTANT: get_constant_schedule, - SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup, -} - - -def get_scheduler( - name: Union[str, SchedulerType], - learning_rate: float = 0.1, - num_warmup_steps: Optional[int] = None, - num_training_steps: Optional[int] = None, - num_cycles: int = 1, - power: float = 1.0, - last_epoch: int = -1, -): - """ - Unified API to get any scheduler from its name. - - Args: - name (`str` or `SchedulerType`): - The name of the scheduler to use. - learning_rate (`float`): - The base learning rate. It is a python float number. - num_warmup_steps (`int`, *optional*): - The number of warmup steps to do. This is not required by all schedulers (hence the argument being - optional), the function will raise an error if it's unset and the scheduler type requires it. 
- num_training_steps (`int``, *optional*): - The number of training steps to do. This is not required by all schedulers (hence the argument being - optional), the function will raise an error if it's unset and the scheduler type requires it. - num_cycles (`int`, *optional*): - The number of hard restarts used in `COSINE_WITH_RESTARTS` scheduler. - power (`float`, *optional*, defaults to 1.0): - Power factor. See `POLYNOMIAL` scheduler - last_epoch (`int`, *optional*, defaults to -1): - The index of the last epoch when resuming training. - """ - name = SchedulerType(name) - schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name] - if name == SchedulerType.CONSTANT: - return schedule_func(learning_rate=learning_rate, last_epoch=last_epoch) - - # All other schedulers require `num_warmup_steps` - if num_warmup_steps is None: - raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.") - - if name == SchedulerType.CONSTANT_WITH_WARMUP: - return schedule_func(learning_rate=learning_rate, num_warmup_steps=num_warmup_steps, last_epoch=last_epoch) - - # All other schedulers require `num_training_steps` - if num_training_steps is None: - raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.") - - if name == SchedulerType.COSINE_WITH_RESTARTS: - return schedule_func( - learning_rate=learning_rate, - num_warmup_steps=num_warmup_steps, - num_training_steps=num_training_steps, - num_cycles=num_cycles, - last_epoch=last_epoch, - ) - - if name == SchedulerType.POLYNOMIAL: - return schedule_func( - learning_rate=learning_rate, - num_warmup_steps=num_warmup_steps, - num_training_steps=num_training_steps, - power=power, - last_epoch=last_epoch, - ) - - return schedule_func( - learning_rate=learning_rate, - num_warmup_steps=num_warmup_steps, - num_training_steps=num_training_steps, - last_epoch=last_epoch, - ) diff --git a/ppdiffusers/ppdiffusers/patches/__init__.py b/ppdiffusers/ppdiffusers/patches/__init__.py deleted file mode 100644 index 6f83cf55fd5e..000000000000 --- a/ppdiffusers/ppdiffusers/patches/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import ppnlp_patch_utils, tomesd_patch_utils, webui_lora_patch_utils diff --git a/ppdiffusers/ppdiffusers/patches/ppnlp_patch_utils.py b/ppdiffusers/ppdiffusers/patches/ppnlp_patch_utils.py deleted file mode 100644 index 72c73b6d2a01..000000000000 --- a/ppdiffusers/ppdiffusers/patches/ppnlp_patch_utils.py +++ /dev/null @@ -1,1544 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
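# A minimal usage sketch for the unified get_scheduler API defined above, assuming a
# ppdiffusers build that still ships ppdiffusers.optimization. The returned LambdaDecay is
# passed to a Paddle optimizer as its learning_rate and stepped once per optimizer update.
import paddle
from ppdiffusers.optimization import get_scheduler

model = paddle.nn.Linear(8, 8)
lr_scheduler = get_scheduler(
    "cosine",
    learning_rate=1e-4,
    num_warmup_steps=500,
    num_training_steps=10_000,
)
optimizer = paddle.optimizer.AdamW(learning_rate=lr_scheduler, parameters=model.parameters())

for _ in range(3):  # toy training loop
    loss = model(paddle.randn([2, 8])).mean()
    loss.backward()
    optimizer.step()
    lr_scheduler.step()
    optimizer.clear_grad()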
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import builtins -import contextlib -import copy -import functools -import json -import math -import weakref -from collections import OrderedDict -from types import FunctionType, MethodType -from typing import Any, Callable, Dict, List, Optional, Tuple - -from ..utils import ( - DIFFUSERS_CACHE, - FROM_DIFFUSERS, - FROM_HF_HUB, - HF_HUB_OFFLINE, - LOW_CPU_MEM_USAGE_DEFAULT, - PPDIFFUSERS_CACHE, - TO_DIFFUSERS, - _add_variant, - _get_model_file, - get_logger, - is_paddle_available, - is_paddlenlp_available, - is_ppxformers_available, - is_safetensors_available, - is_torch_available, - is_torch_file, - smart_load, -) - -logger = get_logger(__name__) - -__all__ = [] - -from contextlib import ExitStack - - -class ContextManagers: - """ - Wrapper for `contextlib.ExitStack` which enters a collection of context managers. Adaptation of `ContextManagers` - in the `fastcore` library. - """ - - def __init__(self, context_managers): - self.context_managers = context_managers - self.stack = ExitStack() - - def __enter__(self): - for context_manager in self.context_managers: - self.stack.enter_context(context_manager) - - def __exit__(self, *args, **kwargs): - self.stack.__exit__(*args, **kwargs) - - -def copy_func(f): - "Copy a non-builtin function (NB `copy.copy` does not work for this)" - if not isinstance(f, FunctionType): - return copy.copy(f) - fn = FunctionType(f.__code__, f.__globals__, f.__name__, f.__defaults__, f.__closure__) - fn.__kwdefaults__ = f.__kwdefaults__ - fn.__dict__.update(f.__dict__) - fn.__annotations__.update(f.__annotations__) - fn.__qualname__ = f.__qualname__ - return fn - - -class _clsmethod: - def __init__(self, f): - self.f = f - - def __get__(self, _, f_cls): - return MethodType(self.f, f_cls) - - -# copied from https://github.com/fastai/fastcore/blob/c9b4c088d3706569c076e7c197c724730be190ab/fastcore/basics.py#L938-L954 -def patch_to(cls, as_prop=False, cls_method=False): - "Decorator: add `f` to `cls`" - if not isinstance(cls, (tuple, list)): - cls = (cls,) - - def _inner(f): - for c_ in cls: - nf = copy_func(f) - nm = f.__name__ - # `functools.update_wrapper` when passing patched function to `Pipeline`, so we do it manually - for o in functools.WRAPPER_ASSIGNMENTS: - setattr(nf, o, getattr(f, o)) - nf.__qualname__ = f"{c_.__name__}.{nm}" - if cls_method: - # fix https://github.com/fastai/fastcore/issues/510 - setattr(c_, nm, _clsmethod(nf)) - else: - setattr(c_, nm, property(nf) if as_prop else nf) - # Avoid clobbering existing functions - return globals().get(nm, builtins.__dict__.get(nm, None)) - - return _inner - - -if is_paddle_available(): - import paddle - import paddle.nn as nn - - def is_floating_point(x): - if not isinstance(x, (paddle.Tensor, paddle.static.Variable)): - raise TypeError("Expected Tensor, but received type of x: {}".format(type(x))) - dtype = x.dtype - is_fp_dtype = ( - dtype == paddle.float32 or dtype == paddle.float64 or dtype == paddle.float16 or dtype == paddle.bfloat16 - ) - return is_fp_dtype - - if not hasattr(paddle, "is_floating_point"): - paddle.is_floating_point = is_floating_point - - # paddle.long = paddle.int64 - # 
paddle.int = paddle.int32 - # paddle.double = paddle.float64 - # paddle.half = paddle.float16 - # paddle.Tensor.half = lambda x: paddle.cast(x, paddle.float16) - # paddle.Tensor.float = lambda x: paddle.cast(x, paddle.float32) - # paddle.Tensor.double = lambda x: paddle.cast(x, paddle.float64) - # paddle.Tensor.int = lambda x: paddle.cast(x, paddle.int32) - # paddle.Tensor.long = lambda x: paddle.cast(x, paddle.int64) - # paddle.Tensor.bool = lambda x: paddle.cast(x, paddle.bool) - # paddle.Tensor.clamp = paddle.clip - # paddle.clamp = paddle.clip - - def view_pt(x, *shape: builtins.int, name=None): - return paddle.reshape(x, shape=shape, name=name) - - paddle.view = view_pt - paddle.Tensor.view = view_pt - - if not hasattr(paddle.Tensor, "data_ptr"): - paddle.Tensor.data_ptr = lambda x: x.value().get_tensor()._ptr() - - def permute_pt(x, *perm: builtins.int, name=None): - return paddle.transpose(x, perm=perm, name=name) - - paddle.permute = permute_pt - paddle.Tensor.permute = permute_pt - paddle.Tensor.softmax = nn.functional.softmax - - # patch repeat_interleave - raw_repeat_interleave = paddle.repeat_interleave - - @paddle.jit.not_to_static - def repeat_interleave(x, repeats, axis=None, name=None): - fp16 = False - if x.dtype == paddle.float16: - x = x.cast(paddle.float32) - fp16 = True - - out = raw_repeat_interleave(x, repeats=repeats, axis=axis, name=name) - - if fp16: - out = out.cast(paddle.float16) - return out - - paddle.repeat_interleave = repeat_interleave - paddle.Tensor.repeat_interleave = repeat_interleave - - # patch max - raw_max = paddle.max - - @paddle.jit.not_to_static - def max(x, axis=None, keepdim=False, name=None): - fp16 = False - if x.dtype == paddle.float16: - x = x.cast(paddle.float32) - fp16 = True - - out = raw_max(x, axis=axis, keepdim=keepdim, name=name) - - if fp16: - out = out.cast(paddle.float16) - return out - - paddle.max = max - paddle.Tensor.max = max - - # patch gather_nd support bfloat16 - raw_gather_nd = paddle.gather_nd - - @paddle.jit.not_to_static - def gather_nd(x, index, name=None): - bfp16 = False - if x.dtype == paddle.bfloat16: - x = x.cast(paddle.float16) - bfp16 = True - - out = raw_gather_nd(x, index=index, name=name) - - if bfp16: - out = out.cast(paddle.bfloat16) - return out - - paddle.gather_nd = gather_nd - paddle.Tensor.gather_nd = gather_nd - paddle.Tensor.contiguous = lambda x: x - - # must return self! 
- def eval(self): - # Layer-level setting - self.training = False - for layer in self.sublayers(): - layer.training = False - return self - - nn.Layer.eval = eval - - def Parameter(data: paddle.Tensor, requires_grad=True): - tensor = paddle.create_parameter(data.shape, dtype=data.dtype, default_initializer=nn.initializer.Assign(data)) - if not requires_grad: - tensor.stop_gradient = True - return tensor - - nn.Parameter = Parameter - - @contextlib.contextmanager - def device_scope(device="cpu"): - new_device = device.replace("cuda", "gpu") - old_device = paddle.get_device() - try: - paddle.set_device(new_device) - yield - finally: - paddle.set_device(old_device) - - paddle.device_scope = device_scope - - def get_sublayer(self, target: str): - if target == "": - return self - - atoms: List[str] = target.split(".") - mod: nn.Layer = self - - for item in atoms: - if not hasattr(mod, item): - raise AttributeError(mod.__class__.__name__ + " has no " "attribute `" + item + "`") - - mod = getattr(mod, item) - - if not isinstance(mod, nn.Layer): - raise AttributeError("`" + item + "` is not " "an nn.Layer") - return mod - - nn.Layer.get_sublayer = get_sublayer - - class _WrappedHook: - def __init__(self, hook: Callable, module: Optional["nn.Layer"] = None): - self.hook: Callable = hook - functools.update_wrapper(self, hook) - - self.with_module: bool = False - - if module is not None: - self.module: weakref.ReferenceType["nn.Layer"] = weakref.ref(module) - self.with_module = True - - def __call__(self, *args: Any, **kwargs: Any) -> Any: - if self.with_module: - module = self.module() - if module is None: - raise RuntimeError("You are trying to call the hook of a dead Module!") - return self.hook(module, *args, **kwargs) - return self.hook(*args, **kwargs) - - def __getstate__(self) -> Dict: - result = {"hook": self.hook, "with_module": self.with_module} - if self.with_module: - result["module"] = self.module() - - return result - - def __setstate__(self, state: Dict): - self.hook = state["hook"] - self.with_module = state["with_module"] - - if self.with_module: - if state["module"] is None: - raise RuntimeError("You are trying to revive the hook of a dead Module!") - self.module = weakref.ref(state["module"]) - - try: - from paddle.nn.layer.layers import HookRemoveHelper - except ImportError: - from paddle.fluid.dygraph.layers import HookRemoveHelper - - def register_load_state_dict_pre_hook(self, hook, with_module=False): - if not hasattr(self, "load_state_dict_pre_hooks"): - self.load_state_dict_pre_hooks = OrderedDict() - handle = HookRemoveHelper(self.load_state_dict_pre_hooks) - self.load_state_dict_pre_hooks[handle._hook_id] = _WrappedHook(hook, self if with_module else None) - return handle - - nn.Layer.register_load_state_dict_pre_hook = register_load_state_dict_pre_hook - - raw_set_state_dict = nn.Layer.set_state_dict - - def set_state_dict(self, state_dict, use_structured_name: bool = True): - if hasattr(self, "load_state_dict_pre_hooks"): - for hook in self.load_state_dict_pre_hooks.values(): - hook(state_dict) - # POP is_torch_weight - state_dict.pop("is_torch_weight", None) - return raw_set_state_dict(self, state_dict, use_structured_name=use_structured_name) - - nn.Layer.set_state_dict = set_state_dict - nn.Layer.load_dict = nn.Layer.set_state_dict - nn.Layer.set_dict = nn.Layer.set_state_dict - -if is_paddle_available() and is_paddlenlp_available(): - import paddle - - import paddlenlp.transformers - from paddlenlp import __version__ - from paddlenlp.transformers import 
PretrainedConfig, PretrainedModel - - try: - from paddlenlp.transformers.model_utils import no_init_weights - except ImportError: - from ..utils.paddle_utils import no_init_weights - - if is_ppxformers_available(): - from paddle.incubate.nn.memory_efficient_attention import ( - memory_efficient_attention, - ) - from paddle.nn.functional.flash_attention import flash_attention - - def scaled_dot_product_attention_( - query, - key, - value, - attn_mask=None, - dropout_p=0.0, - is_causal=False, - scale=None, - training=True, - attention_op="cutlass", - ): - if attn_mask is not None or attention_op == "math": - if scale is None: - scale = 1 / math.sqrt(query.shape[-1]) - qt = paddle.transpose(query, [0, 2, 1, 3]) - kt = paddle.transpose(key, [0, 2, 1, 3]) - vt = paddle.transpose(value, [0, 2, 1, 3]) - s = paddle.matmul(qt * scale, kt, transpose_y=True) - if is_causal: - p = paddle.incubate.softmax_mask_fuse_upper_triangle(s) - else: - if attn_mask is not None: - attn_mask = paddle.transpose(attn_mask, [0, 2, 1, 3]) - if attn_mask.cast("float32").min() == 0 and attn_mask.cast("float32").max() == 1: - attn_mask = (attn_mask.cast(s.dtype) - 1) * 10000.0 - s = s + attn_mask - p = paddle.nn.functional.softmax(s) - if dropout_p > 0.0: - p = paddle.nn.functional.dropout(p, dropout_p, training=training, mode="upscale_in_train") - o = paddle.matmul(p, vt) - return paddle.transpose(o, [0, 2, 1, 3]) - elif attention_op is None or attention_op == "cutlass" or training: - if scale is None: - scale = 1 / math.sqrt(query.shape[-1]) - # support fp32, fp16, bfp16 - output = memory_efficient_attention( - query, - key, - value, - None, - p=dropout_p, - scale=scale, - training=training, - ) - elif attention_op == "flash": - raw_dtype = query.dtype - if raw_dtype == paddle.float32: - query, key, value = ( - query.cast(paddle.float16), - key.cast(paddle.float16), - value.cast(paddle.float16), - ) - output = flash_attention(query, key, value, dropout=dropout_p, causal=is_causal, return_softmax=False)[ - 0 - ] - if raw_dtype == paddle.float32: - output = output.cast(raw_dtype) - else: - raise ValueError("ppxformers's attention_op shoulde be in ['cutlass', 'flash', 'math']") - return output - - paddle.nn.functional.scaled_dot_product_attention_ = scaled_dot_product_attention_ - - @patch_to(nn.Layer, as_prop=True) - def dtype(parameter: nn.Layer) -> paddle.dtype: - try: - return next(parameter.named_parameters())[1].dtype - except StopIteration: - try: - return next(parameter.named_buffers())[1].dtype - except StopIteration: - return parameter._dtype - - @patch_to(PretrainedModel, as_prop=True) - def device(self): - try: - return next(self.named_parameters())[1].place - except StopIteration: - try: - return next(self.named_buffers())[1].place - except StopIteration: - return paddle.get_device() - - try: - from paddlenlp.transformers import XLMRobertaTokenizer - except ImportError: - # patch xlm-roberta tokenizer - """Tokenization classes for XLM-RoBERTa model.""" - import os - from shutil import copyfile - - import sentencepiece as spm - - from paddlenlp.transformers.tokenizer_utils import ( - AddedToken, - PretrainedTokenizer, - ) - - SPIECE_UNDERLINE = "▁" - - class XLMRobertaTokenizer(PretrainedTokenizer): - - resource_files_names = {"vocab_file": "sentencepiece.bpe.model"} - pretrained_resource_files_map = {} - pretrained_init_configuration = {} - max_model_input_sizes = { - "xlm-roberta-base": 512, - "xlm-roberta-large": 512, - "xlm-roberta-large-finetuned-conll02-dutch": 512, - 
"xlm-roberta-large-finetuned-conll02-spanish": 512, - "xlm-roberta-large-finetuned-conll03-english": 512, - "xlm-roberta-large-finetuned-conll03-german": 512, - } - model_input_names = ["input_ids", "attention_mask"] - - def __init__( - self, - vocab_file, - bos_token="", - eos_token="", - sep_token="", - cls_token="", - unk_token="", - pad_token="", - mask_token="", - sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs - ) -> None: - # Mask token behave like a normal word, i.e. include the space before it - mask_token = ( - AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - ) - - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - sp_model_kwargs=self.sp_model_kwargs, - **kwargs, - ) - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(str(vocab_file)) - self.vocab_file = vocab_file - - # Original fairseq vocab and spm vocab must be "aligned": - # Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 - # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ---- - # fairseq | '' | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' - # spm | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' | '▁a' - - # Mimic fairseq token-to-id alignment for the first 4 token - self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} - - # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab - self.fairseq_offset = 1 - - self.fairseq_tokens_to_ids[""] = len(self.sp_model) + self.fairseq_offset - self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} - - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - state["sp_model_proto"] = self.sp_model.serialized_model_proto() - return state - - def __setstate__(self, d): - self.__dict__ = d - - # for backward compatibility - if not hasattr(self, "sp_model_kwargs"): - self.sp_model_kwargs = {} - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.LoadFromSerializedProto(self.sp_model_proto) - - def build_inputs_with_special_tokens( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. An XLM-RoBERTa sequence has the following format: - - single sequence: ` X ` - - pair of sequences: ` A B ` - Args: - token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - Returns: - `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + sep + token_ids_1 + sep - - def get_special_tokens_mask( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None, - already_has_special_tokens: bool = False, - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. 
This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] - - def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does - not make use of token type ids, therefore a list of zeros is returned. - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - Returns: - `List[int]`: List of zeros. - """ - - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - - @property - def vocab_size(self): - return len(self.sp_model) + self.fairseq_offset + 1 # Add the token - - def get_vocab(self): - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) - return vocab - - def _tokenize(self, text: str) -> List[str]: - return self.sp_model.encode(text, out_type=str) - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - if token in self.fairseq_tokens_to_ids: - return self.fairseq_tokens_to_ids[token] - spm_id = self.sp_model.PieceToId(token) - - # Need to return unknown token if the SP model returned 0 - return spm_id + self.fairseq_offset if spm_id else self.unk_token_id - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - if index in self.fairseq_ids_to_tokens: - return self.fairseq_ids_to_tokens[index] - return self.sp_model.IdToPiece(index - self.fairseq_offset) - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (strings for sub-words) in a single string.""" - out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() - return out_string - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: - if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") - return - out_vocab_file = os.path.join( - save_directory, - (filename_prefix + "-" if filename_prefix else "") + self.resource_files_names["vocab_file"], - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile( - self.vocab_file - ): - copyfile(self.vocab_file, out_vocab_file) - elif not os.path.isfile(self.vocab_file): - with open(out_vocab_file, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto() - fi.write(content_spiece_model) - - return (out_vocab_file,) - - 
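# --- Illustrative sketch (not part of the original patch): the special-token layout and the
# fairseq/sentencepiece id offset implemented by the XLMRobertaTokenizer fallback above.
# The ids below are made up; <s>=0, </s>=2, <unk>=3 and fairseq_offset=1 as in the code.
def build_inputs(ids_a, ids_b=None, cls_id=0, sep_id=2):
    if ids_b is None:
        return [cls_id] + ids_a + [sep_id]                         # "<s> X </s>"
    return [cls_id] + ids_a + [sep_id, sep_id] + ids_b + [sep_id]  # "<s> A </s></s> B </s>"

def spm_to_fairseq(spm_id, fairseq_offset=1, unk_id=3):
    # sentencepiece reserves piece id 0 for <unk>, so a 0 from the SP model maps to <unk>;
    # every other piece id is shifted by the offset to mimic the fairseq vocab alignment.
    return spm_id + fairseq_offset if spm_id else unk_id

print(build_inputs([10, 11]))                 # [0, 10, 11, 2]
print(build_inputs([10, 11], [20]))           # [0, 10, 11, 2, 2, 20, 2]
print(spm_to_fairseq(5), spm_to_fairseq(0))   # 6 3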
paddlenlp.transformers.XLMRobertaTokenizer = XLMRobertaTokenizer - - # patch BertModel forward - from paddlenlp.transformers import BertModel - - BertModel.raw_forward = BertModel.forward - - def forward_new( - self, - input_ids: paddle.Tensor, - token_type_ids: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.Tensor] = None, - attention_mask: Optional[paddle.Tensor] = None, - past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None, - use_cache: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - if attention_mask is None: - attention_mask = paddle.ones_like(input_ids) - return self.raw_forward( - input_ids=input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_hidden_states=output_hidden_states, - output_attentions=output_attentions, - return_dict=return_dict, - ) - - BertModel.forward = forward_new - - TRANSFORMERS_SAFE_WEIGHTS_NAME = "model.safetensors" - TRANSFORMERS_WEIGHTS_NAME = "pytorch_model.bin" - - # patch from_pretrained and save_pretrained - def from_pretrained_v3(cls, pretrained_model_name_or_path, *args, from_hf_hub: bool = False, **kwargs): - cache_dir = ( - kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) - ) - ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) - force_download = kwargs.pop("force_download", False) - from_diffusers = kwargs.pop("from_diffusers", None) - if from_diffusers is None: - from_diffusers = FROM_DIFFUSERS - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - output_loading_info = kwargs.pop("output_loading_info", False) - local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) - use_auth_token = kwargs.pop("use_auth_token", None) - revision = kwargs.pop("revision", None) - paddle_dtype = kwargs.pop("paddle_dtype", None) - # do not use paddlenlp dtype - _dtype = kwargs.pop("dtype", None) - if _dtype is not None and paddle_dtype is None: - paddle_dtype = _dtype - subfolder = kwargs.pop("subfolder", None) - variant = kwargs.pop("variant", None) - low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", LOW_CPU_MEM_USAGE_DEFAULT) - - user_agent = { - "ppdiffusers": __version__, - "file_type": "model", - "framework": "paddle", - } - - config = None - - model_kwargs = kwargs - # 1. get the PretrainedConfig to init model - if not isinstance(config, PretrainedConfig): - config_path = config if config is not None else pretrained_model_name_or_path - - # TODO fix config from_pretrained - # must from hf hub - if from_hf_hub: - if subfolder is not None: - kwargs["subfolder"] = subfolder - else: - if subfolder is not None: - config_path = ( - os.path.join(config_path, subfolder) - if os.path.isdir(config_path) - else "/".join([config_path, subfolder]) - ) - - config, model_kwargs = cls.config_class.from_pretrained( - config_path, - cache_dir=cache_dir, - return_unused_kwargs=True, - force_download=force_download, - from_hf_hub=from_hf_hub, - **kwargs, - ) - assert config is not None - - # we will remove in the future. 
- if not from_hf_hub and not os.path.exists(os.path.join(cache_dir, config_path, "config.json")): - config.save_pretrained(os.path.join(cache_dir, config_path)) - - if paddle_dtype is None: - paddle_dtype = config.get("dtype", paddle.get_default_dtype()) - # This variable will flag if we're loading a sharded checkpoint. In this case the archive file is just the - # Load model - model_file = None - if from_diffusers: - if is_safetensors_available(): - try: - model_file = _get_model_file( - pretrained_model_name_or_path, - weights_name=_add_variant(TRANSFORMERS_SAFE_WEIGHTS_NAME, variant), - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - from_hf_hub=from_hf_hub, - ) - except Exception: # noqa: E722 - model_file = None - pass - if model_file is None: - model_file = _get_model_file( - pretrained_model_name_or_path, - weights_name=_add_variant(TRANSFORMERS_WEIGHTS_NAME, variant), - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - from_hf_hub=from_hf_hub, - ) - else: - model_file = _get_model_file( - pretrained_model_name_or_path, - weights_name=_add_variant("model_state.pdparams", variant), - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - from_hf_hub=from_hf_hub, - ) - assert model_file is not None - - # try load model_file with paddle / torch / safetensor - state_dict = smart_load(model_file) - init_contexts = [] - - dtype = set(v.dtype for v in state_dict.values() if paddle.is_tensor(v) and paddle.is_floating_point(v)) - if len(dtype) > 1 and paddle.float32 not in dtype: - raise ValueError( - f"The weights of the model file {model_file} have a mixture of incompatible dtypes {dtype}. Please" - f" make sure that {model_file} weights have only one dtype." - ) - elif len(dtype) > 1 and paddle.float32 in dtype: - dtype = paddle.float32 - elif len(dtype) == 0: - dtype = paddle.float32 - else: - dtype = dtype.pop() - - init_contexts.append(paddle.dtype_guard(dtype)) - - if low_cpu_mem_usage: - # Instantiate model. - init_contexts.append(no_init_weights(_enable=True)) - if hasattr(paddle, "LazyGuard"): - init_contexts.append(paddle.LazyGuard()) - - with ContextManagers(init_contexts): - model = cls(config, **model_kwargs) - - # convert weights - if (from_diffusers or is_torch_file(model_file)) and hasattr(cls, "smart_convert"): - state_dict = cls.smart_convert(state_dict, model) - - loaded_state_dict_keys = list(state_dict.keys()) - - model, missing_keys, unexpected_keys, mismatched_keys = cls._load_pretrained_model_old( - model=model, - state_dict=state_dict, - loaded_keys=loaded_state_dict_keys, - ignore_mismatched_sizes=ignore_mismatched_sizes, - dtype=None, - ) - loading_info = { - "missing_keys": missing_keys, - "unexpected_keys": unexpected_keys, - "mismatched_keys": mismatched_keys, - "error_msgs": "", - } - - # if paddle_dtype is not None and not isinstance(paddle_dtype, paddle.dtype): - # raise ValueError( - # f"{paddle_dtype} needs to be of type `paddle.dtype`, e.g. 
`paddle.float16`, but is {type(paddle_dtype)}." - # ) - if paddle_dtype is not None: - model = model.to(dtype=paddle_dtype) - - if len(unexpected_keys) > 0: - logger.warning( - f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when" - f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are" - f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task or" - " with another architecture (e.g. initializing a BertForSequenceClassification model from a" - " BertForPreTraining model).\n- This IS NOT expected if you are initializing" - f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly identical" - " (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)." - ) - else: - logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") - - if len(missing_keys) > 0: - logger.warning( - f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" - f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably" - " TRAIN this model on a down-stream task to be able to use it for predictions and inference." - ) - elif len(mismatched_keys) == 0: - logger.info( - f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at" - f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the checkpoint" - f" was trained on, you can already use {model.__class__.__name__} for predictions without further" - " training." - ) - if len(mismatched_keys) > 0: - mismatched_warning = "\n".join( - [ - f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" - for key, shape1, shape2 in mismatched_keys - ] - ) - logger.warning( - f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" - f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not" - f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be able" - " to use it for predictions and inference." 
- ) - - if output_loading_info: - return model, loading_info - - return model - - import re - - import numpy as np - - @classmethod - def _load_pretrained_model_old( - cls, - model: PretrainedModel, - state_dict: Dict[str, paddle.Tensor], - loaded_keys: List[str], - ignore_mismatched_sizes=False, - dtype=None, - ) -> Tuple[List[str]]: - model_state_dict = model.state_dict() - - expected_keys = list(model_state_dict.keys()) - prefix = model.base_model_prefix - - if len(prefix) > 0: - has_prefix_module = any(s.startswith(prefix) for s in loaded_keys) - expects_prefix_module = any(s.startswith(prefix) for s in expected_keys) - else: - has_prefix_module = False - expects_prefix_module = False - - # key re-naming operations are never done on the keys - # that are loaded, but always on the keys of the newly initialized model - remove_prefix_from_model = not has_prefix_module and expects_prefix_module - add_prefix_to_model = has_prefix_module and not expects_prefix_module - - if remove_prefix_from_model: - expected_keys = [".".join(s.split(".")[1:]) if s.startswith(prefix) else s for s in expected_keys] - elif add_prefix_to_model: - expected_keys = [".".join([prefix, s]) for s in expected_keys] - - missing_keys = list(set(expected_keys) - set(loaded_keys)) - unexpected_keys = list(set(loaded_keys) - set(expected_keys)) - - # Some models may have keys that are not in the state by design, removing them before needlessly warning - # the user. - if cls._keys_to_ignore_on_load_missing is not None: - for pat in cls._keys_to_ignore_on_load_missing: - missing_keys = [k for k in missing_keys if re.search(pat, k) is None] - - if cls._keys_to_ignore_on_load_unexpected is not None: - for pat in cls._keys_to_ignore_on_load_unexpected: - unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] - - # Make sure we are able to load base models as well as derived models (with heads) - start_prefix = "" - model_to_load = model - if len(cls.base_model_prefix) > 0 and not hasattr(model, cls.base_model_prefix) and has_prefix_module: - start_prefix = cls.base_model_prefix + "." - - def _find_mismatched_keys( - state_dict, - model_state_dict, - loaded_keys, - add_prefix_to_model, - remove_prefix_from_model, - ignore_mismatched_sizes, - ): - mismatched_keys = [] - if ignore_mismatched_sizes: - for checkpoint_key in loaded_keys: - model_key = checkpoint_key - if remove_prefix_from_model: - # The model key starts with `prefix` but `checkpoint_key` doesn't so we add it. - model_key = f"{prefix}.{checkpoint_key}" - elif add_prefix_to_model: - # The model key doesn't start with `prefix` but `checkpoint_key` does so we remove it. - model_key = ".".join(checkpoint_key.split(".")[1:]) - - if ( - model_key in model_state_dict - and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape - ): - mismatched_keys.append( - (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape) - ) - del state_dict[checkpoint_key] - return mismatched_keys - - # Whole checkpoint - mismatched_keys = _find_mismatched_keys( - state_dict, - model_state_dict, - loaded_keys, - add_prefix_to_model, - remove_prefix_from_model, - ignore_mismatched_sizes, - ) - - start_prefix = prefix + "." - - # `add_prefix_to_model` and `remove_prefix_from_model` are for different situation, - # you can check the following matrix, which means: - # the value of cell: (add_prefix_to_model, remove_prefix_from_model) - # the load/Init-Base is the state-dict which don't contain `prefix`. 
- # the load/Init-DownStream is the state-dict which contain the `prefix` - # - # | | load-Base | load-DownStream | - # |-----------------|-----------|-----------------| - # | Init-Base | F,F | T,F | - # | Init-DonwStream | F,T | F,F | - # - # the above value matrix will help you understand the following code. - if add_prefix_to_model: - for key in list(state_dict.keys()): - if key.startswith(start_prefix): - state_dict[key.replace(start_prefix, "")] = state_dict.pop(key) - - if remove_prefix_from_model: - for key in list(state_dict.keys()): - state_dict[start_prefix + key] = state_dict.pop(key) - - # convert the dtype of state dict - if dtype is not None: - if isinstance(dtype, paddle.dtype): - dtype = str(dtype)[7:] - - if dtype not in ["float32", "float16", "bfloat16"]: - raise ValueError( - f"the value of `dtype` should be one of [`float32`, `float16`, `bfloat16`], but received {dtype}" - ) - for key in state_dict.keys(): - target_dtype = dtype - if isinstance(state_dict[key], np.ndarray): - if not issubclass(state_dict[key].dtype.type, np.floating): - continue - - # TODO(wj-Mcat): add `keep_in_fp32` feature to enable hybrid fp32 state-dict - # this is the temp hard code for fused-mt transformer - if model.keep_in_fp32_modules(key, model.config, dtype): - target_dtype = "float32" - # state_dict[key] = convert_ndarray_dtype(state_dict[key], target_dtype) - - elif isinstance(state_dict[key], paddle.Tensor): - if not state_dict[key].is_floating_point(): - continue - - # TODO(wj-Mcat): add `keep_in_fp32` feature to enable hybrid fp32 state-dict - # this is the temp hard code for fused-mt transformer - if model.keep_in_fp32_modules(key, model.config, dtype): - target_dtype = "float32" - state_dict[key] = paddle.cast(state_dict[key], dtype=target_dtype) - else: - raise ValueError(f"the dtype<{state_dict[key].dtype}> of current state-dict[{key}] is not valid") - else: - dtype_prefix_len = len("paddle.") - for k, v in model_to_load.state_dict().items(): - if not isinstance(v, np.ndarray): - dtype = str(v.dtype)[dtype_prefix_len:] - if k in state_dict: - if paddle.in_dynamic_mode(): - if isinstance(state_dict[k], np.ndarray): - state_dict[k] = state_dict[k].astype(dtype) - else: - state_dict[k] = paddle.cast(state_dict[k], dtype) - else: - # there are some latent error when case dtype in static-mode, so let's: - # 1. convert fluid.*.Tensor -> numpy.ndarray - # 2. cast the dtype with numpy tools - # 3. paddle works well with ndarray state-dict - state_dict[k] = np.array(state_dict[k]) - state_dict[k] = state_dict[k].astype(dtype) - - # For model parallel if FastGeneration - # To avoid recursive import temporarily. 
- import paddlenlp.ops.fast_transformer.transformer.decoding as ft_decoding - - state_to_load = ft_decoding.get_ft_para_conf().fit_partial_model(model_to_load, state_dict) - if paddle.in_dynamic_mode(): - model_to_load.set_state_dict(state_to_load) - - return model_to_load, missing_keys, unexpected_keys, mismatched_keys - - PretrainedModel._load_pretrained_model_old = _load_pretrained_model_old - - # PretrainedModel.from_pretrained is classmethod - raw_from_pretrained = PretrainedModel.from_pretrained.__func__ - raw_save_pretrained = PretrainedModel.save_pretrained - - @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path, - *args, - from_hf_hub=False, - subfolder=None, - paddle_dtype=None, - from_diffusers=None, - variant=None, - **kwargs - ): - try: - if cls.constructed_from_pretrained_config() and ( - hasattr(cls, "smart_convert") or hasattr(cls, "register_load_torch_hook") - ): - return from_pretrained_v3( - cls, - pretrained_model_name_or_path, - *args, - from_hf_hub=from_hf_hub, - subfolder=subfolder, - paddle_dtype=paddle_dtype, - from_diffusers=from_diffusers, - variant=variant, - **kwargs, - ) - except Exception: - pass - - dtype = kwargs.pop("dtype", paddle_dtype) - if isinstance(dtype, paddle.dtype): - dtype = str(dtype).replace("paddle.", "") - return raw_from_pretrained( - cls, - pretrained_model_name_or_path, - *args, - from_hf_hub=from_hf_hub, - subfolder=subfolder, - dtype=dtype, - **kwargs, - ) - - PretrainedModel.from_pretrained = from_pretrained - - if is_safetensors_available(): - from safetensors.numpy import save_file as safetensors_numpy_save_file - - if is_torch_available(): - from safetensors.torch import save_file as safetensors_torch_save_file - - if is_torch_available(): - import torch - - def save_pretrained_v3( - self: PretrainedModel, - save_directory: str, - is_main_process: bool = True, - save_function: Callable = None, - safe_serialization: bool = False, - variant: Optional[str] = None, - to_diffusers: Optional[bool] = None, - ): - from ..models.modeling_pytorch_paddle_utils import ( - convert_paddle_state_dict_to_pytorch, - ) - from ..models.modeling_utils import convert_state_dict - - if to_diffusers is None: - to_diffusers = TO_DIFFUSERS - - if to_diffusers and safe_serialization and not is_safetensors_available(): - raise ImportError("`safe_serialization` requires the `safetensors library: `pip install safetensors`.") - - if os.path.isfile(save_directory): - logger.error(f"Provided path ({save_directory}) should be a directory, not a file") - return - - model_to_save = self._layers if isinstance(self, paddle.DataParallel) else self - if is_main_process: - try: - model_to_save.config.dtype = str(model_to_save._dtype).split(".")[-1] - except: - model_to_save.config.dtype = "float32" - # Attach architecture to the config - model_to_save.config.architectures = [model_to_save.__class__.__name__] - - model_to_save.config.save_pretrained(save_directory) - - state_dict = model_to_save.state_dict() - # save ignore lora_weights - fn = lambda k: ".lora_" in k or ".alpha" in k - state_dict = {k: v for k, v in state_dict.items() if not fn(k)} - - # choose save_function - if save_function is None: - if to_diffusers: - if safe_serialization: - if is_torch_available(): - save_function = safetensors_torch_save_file - state_dict = convert_state_dict(state_dict, framework="torch") - else: - save_function = safetensors_numpy_save_file - state_dict = convert_state_dict(state_dict, framework="numpy") - weights_name = _add_variant("model.safetensors", 
variant) - else: - if not is_torch_available(): - raise ImportError( - "`to_diffusers=True` with `safe_serialization=False` requires the `torch library: `pip install torch`." - ) - save_function = torch.save - weights_name = _add_variant("pytorch_model.bin", variant) - state_dict = convert_state_dict(state_dict, framework="torch") - - state_dict = convert_paddle_state_dict_to_pytorch(state_dict, model_to_save) - else: - save_function = paddle.save - weights_name = _add_variant("model_state.pdparams", variant) - - # Save the model - save_function(state_dict, os.path.join(save_directory, weights_name)) - - logger.info(f"Model weights saved in {os.path.join(save_directory, weights_name)}") - - def save_pretrained( - self, - save_dir: str, - is_main_process: bool = True, - state_dict=None, - save_function: Callable = None, - max_shard_size="10GB", - safe_serialization: bool = False, - variant: Optional[str] = None, - to_diffusers: Optional[bool] = None, - *args, - **kwargs, - ): - if self.constructed_from_pretrained_config() and hasattr(self, "smart_convert"): - return save_pretrained_v3( - self, - save_dir, - is_main_process=is_main_process, - save_function=save_function, - safe_serialization=safe_serialization, - variant=variant, - to_diffusers=to_diffusers, - ) - return raw_save_pretrained( - self, - save_dir=save_dir, - is_main_process=is_main_process, - state_dict=state_dict, - save_function=save_function, - max_shard_size=max_shard_size, - safe_serialization=safe_serialization, - variant=variant, - *args, - **kwargs, - ) - - PretrainedModel.save_pretrained = save_pretrained - - from paddlenlp.transformers import ( - BertModel, - BitBackbone, - ClapTextModelWithProjection, - CLIPTextModel, - CLIPTextModelWithProjection, - CLIPVisionModel, - CLIPVisionModelWithProjection, - DPTForDepthEstimation, - SpeechT5HifiGan, - T5EncoderModel, - ) - - if not hasattr(T5EncoderModel, "_keep_in_fp32_modules"): - T5EncoderModel._keep_in_fp32_modules = ["wo"] - - from ..models.modeling_pytorch_paddle_utils import ( - convert_pytorch_state_dict_to_paddle_class_method, - ) - from ..pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesModelWithTransformation, - ) - from ..pipelines.deepfloyd_if.safety_checker import IFSafetyChecker - from ..pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel - from ..pipelines.paint_by_example.image_encoder import PaintByExampleImageEncoder - from ..pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker - from ..pipelines.stable_diffusion_safe.safety_checker import ( - SafeStableDiffusionSafetyChecker, - ) - - @classmethod - def clip_smart_convert(cls, state_dict, pd_model): - new_model_state = {} - name_mapping_dict = { - ".encoder.": ".transformer.", - ".layer_norm": ".norm", - ".mlp.": ".", - ".fc1.": ".linear1.", - ".fc2.": ".linear2.", - ".final_layer_norm.": ".ln_final.", - ".embeddings.": ".", - ".position_embedding.": ".positional_embedding.", - ".patch_embedding.": ".conv1.", - "visual_projection.weight": "vision_projection", - "text_projection.weight": "text_projection", - ".pre_layrnorm.": ".ln_pre.", - ".post_layernorm.": ".ln_post.", - } - ignore_value = [ - "position_ids", - ] - if cls in [PaintByExampleImageEncoder]: - # ignore mapper. 
prefix, we will use convert_pytorch_state_dict_to_paddle to convert mapper.xxxx state_dict - ignore_value.append("mapper.") - elif cls in [IFSafetyChecker]: - pass - else: - name_mapping_dict.update({".vision_model.": "."}) - - donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] - if not hasattr(cls, "paddle_torch_name_mapping"): - cls.paddle_torch_name_mapping = {} - for name, value in state_dict.items(): - torch_name = name - # step1: ignore position_ids - if any(i in name for i in ignore_value): - continue - # step2: transpose nn.Linear weight - if value.ndim == 2 and not any(i in name for i in donot_transpose): - value = value.T - # step3: hf_name -> ppnlp_name mapping - for hf_name, ppnlp_name in name_mapping_dict.items(): - name = name.replace(hf_name, ppnlp_name) - # step4: 0d tensor -> 1d tensor - if name == "logit_scale" and value.ndim == 1: - value = value.reshape((1,)) - # step5: safety_checker need prefix "clip." - if "vision_model" in name and cls in [StableDiffusionSafetyChecker, SafeStableDiffusionSafetyChecker]: - name = "clip." + name - new_model_state[name] = value - - cls.paddle_torch_name_mapping[name] = torch_name - - if cls in [PaintByExampleImageEncoder]: - # convert mapper - mappersd = cls.smart_convert(state_dict, pd_model, sub_layer="mapper.") - new_model_state.update(mappersd) - - return new_model_state - - @classmethod - def bert_smart_convert(cls, state_dict, pd_model): - new_model_state = {} - name_mapping_dict = { - # about embeddings - "embeddings.LayerNorm.weight": "embeddings.layer_norm.weight", - "embeddings.LayerNorm.bias": "embeddings.layer_norm.bias", - # about encoder layer - "encoder.layer": "encoder.layers", - "attention.self.query": "self_attn.q_proj", - "attention.self.key": "self_attn.k_proj", - "attention.self.value": "self_attn.v_proj", - "attention.output.dense": "self_attn.out_proj", - "attention.output.LayerNorm.weight": "norm1.weight", - "attention.output.LayerNorm.bias": "norm1.bias", - "intermediate.dense": "linear1", - "output.dense": "linear2", - "output.LayerNorm.weight": "norm2.weight", - "output.LayerNorm.bias": "norm2.bias", - # about cls predictions ignore - "cls.predictions.transform.dense": "cls.predictions.transform", - "cls.predictions.decoder.weight": "cls.predictions.decoder_weight", - "cls.predictions.transform.LayerNorm.weight": "cls.predictions.layer_norm.weight", - "cls.predictions.transform.LayerNorm.bias": "cls.predictions.layer_norm.bias", - "cls.predictions.bias": "cls.predictions.decoder_bias", - } - ignore_value = ["position_ids"] - donot_transpose = ["embeddings", "norm"] - if not hasattr(cls, "paddle_torch_name_mapping"): - cls.paddle_torch_name_mapping = {} - for name, value in state_dict.items(): - torch_name = name - # step1: ignore position_ids - if any(i in name for i in ignore_value): - continue - # step2: transpose nn.Linear weight - if value.ndim == 2 and not any(i in name for i in donot_transpose): - value = value.T - # step3: hf_name -> ppnlp_name mapping - for hf_name, ppnlp_name in name_mapping_dict.items(): - name = name.replace(hf_name, ppnlp_name) - new_model_state[name] = value - cls.paddle_torch_name_mapping[name] = torch_name - - return new_model_state - - @classmethod - def ldmbert_smart_convert(cls, state_dict, pd_model): - transformers2ppnlp = { - "model.embed_tokens.weight": "embeddings.word_embeddings.weight", - "model.embed_positions.weight": "embeddings.position_embeddings.weight", - "model.layer_norm.": "final_layer_norm.", - "model.layers": "encoder.layers", 
- ".self_attn_layer_norm.": ".norm1.", - ".final_layer_norm.": ".norm2.", - ".fc1.": ".linear1.", - ".fc2.": ".linear2.", - } - ignore_value = ["to_logits"] - donot_transpose = ["embed_tokens", "embed_positions", "norm"] - new_model_state = {} - if not hasattr(cls, "paddle_torch_name_mapping"): - cls.paddle_torch_name_mapping = {} - for name, value in state_dict.items(): - torch_name = name - # step1: ignore to_logits - if any(i in name for i in ignore_value): - continue - # step2: transpose nn.Linear weight - if value.ndim == 2 and not any(i in name for i in donot_transpose): - value = value.T - # step3: hf_name -> ppnlp_name mapping - for hf_name, ppnlp_name in transformers2ppnlp.items(): - name = name.replace(hf_name, ppnlp_name) - new_model_state[name] = value - cls.paddle_torch_name_mapping[name] = torch_name - - return new_model_state - - LDMBertModel.smart_convert = ldmbert_smart_convert - for cls_ in [ - CLIPTextModel, - CLIPTextModelWithProjection, - CLIPVisionModel, - CLIPVisionModelWithProjection, - StableDiffusionSafetyChecker, - SafeStableDiffusionSafetyChecker, - PaintByExampleImageEncoder, - IFSafetyChecker, - ]: - setattr(cls_, "smart_convert", clip_smart_convert) - - for cls_ in [BertModel, RobertaSeriesModelWithTransformation]: - setattr(cls_, "smart_convert", bert_smart_convert) - - if bool(os.getenv("USE_TORCH_LINEAR", False)): - # NEW TRANSFORMERS CLIP MODEL - from ..pipelines.stable_diffusion.hf_clip_model import ( - HFCLIPModel, - HFCLIPTextModel, - HFCLIPTextModelWithProjection, - HFCLIPVisionModel, - HFCLIPVisionModelWithProjection, - ) - - TRANSFORMERS_CLIP_MODEL = [ - HFCLIPModel, - HFCLIPTextModel, - HFCLIPTextModelWithProjection, - HFCLIPVisionModel, - HFCLIPVisionModelWithProjection, - ] - else: - TRANSFORMERS_CLIP_MODEL = [] - for cls_ in [ - DPTForDepthEstimation, - BitBackbone, - SpeechT5HifiGan, - ClapTextModelWithProjection, - T5EncoderModel, - ] + TRANSFORMERS_CLIP_MODEL: - setattr(cls_, "smart_convert", convert_pytorch_state_dict_to_paddle_class_method) - - # TODO remove this when we updage ImageProcessingMixin - # patch get_image_processor_dict support subfolder. 
- - IMAGE_PROCESSOR_NAME = "preprocessor_config.json" - from paddlenlp.transformers.feature_extraction_utils import FeatureExtractionMixin - from paddlenlp.transformers.image_processing_utils import ImageProcessingMixin - - @classmethod - def get_image_processor_dict(cls, pretrained_model_name_or_path, **kwargs): - from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = ( - kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) - ) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) - use_auth_token = kwargs.pop("use_auth_token", None) - revision = kwargs.pop("revision", None) - subfolder = kwargs.pop("subfolder", None) - user_agent = kwargs.pop("user_agent", None) - - pretrained_model_name_or_path = str(pretrained_model_name_or_path) - - resolved_image_processor_file = _get_model_file( - pretrained_model_name_or_path, - weights_name=IMAGE_PROCESSOR_NAME, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - from_hf_hub=from_hf_hub, - ) - try: - # Load image_processor dict - with open(resolved_image_processor_file, "r", encoding="utf-8") as reader: - text = reader.read() - image_processor_dict = json.loads(text) - - except json.JSONDecodeError: - raise EnvironmentError( - f"It looks like the config file at '{resolved_image_processor_file}' is not a valid JSON file." - ) - # use ppdiffusers logger, not ppnlp_logger - logger.info( - f"loading configuration file {resolved_image_processor_file} from cache at {resolved_image_processor_file}" - ) - - return image_processor_dict, kwargs - - ImageProcessingMixin.get_image_processor_dict = get_image_processor_dict - FeatureExtractionMixin.get_feature_extractor_dict = get_image_processor_dict - - # patch T5LayerFF, we will remove this in the near future. - from paddlenlp.transformers.t5.modeling import T5LayerFF - - def new_forward(self, hidden_states): - forwarded_states = self.layer_norm(hidden_states) - forwarded_states = self.DenseReluDense(forwarded_states) - # make sure FP32 + FP16 = FP32 - hidden_states = self.dropout(forwarded_states) + hidden_states - return hidden_states - - T5LayerFF.forward = new_forward diff --git a/ppdiffusers/ppdiffusers/patches/tomesd_patch_utils.py b/ppdiffusers/ppdiffusers/patches/tomesd_patch_utils.py deleted file mode 100644 index b4f43a780c2e..000000000000 --- a/ppdiffusers/ppdiffusers/patches/tomesd_patch_utils.py +++ /dev/null @@ -1,429 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
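# --- Illustrative usage sketch (not part of the original patch) of the from_pretrained /
# save_pretrained patches above. It assumes ppdiffusers and paddlenlp are installed and that
# the referenced checkpoint exposes a text_encoder subfolder with PyTorch weights.
# `from_diffusers=True` routes loading through the smart_convert hooks; `to_diffusers=True`
# writes the weights back out in the PyTorch layout.
from paddlenlp.transformers import CLIPTextModel

text_encoder = CLIPTextModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    subfolder="text_encoder",
    from_hf_hub=True,
    from_diffusers=True,
)
text_encoder.save_pretrained("./text_encoder_paddle")                     # model_state.pdparams
text_encoder.save_pretrained("./text_encoder_torch", to_diffusers=True)   # pytorch_model.bin (needs torch)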
-# -# Copyright (c) 2023 Daniel Bolya -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -# Adapted from https://github.com/dbolya/tomesd - -import math -from typing import Any, Callable, Dict, Tuple, Type, Union - -import paddle -import paddle.nn as nn - -from ..models.transformer_2d import BasicTransformerBlock -from ..pipelines.pipeline_utils import DiffusionPipeline -from .ppnlp_patch_utils import patch_to - -TOME_PREFIX = "ToMe" - - -def scatter_reduce( - input: paddle.Tensor, - dim: int, - index: paddle.Tensor, - src: paddle.Tensor, - reduce: str = "mean", - include_self: bool = True, -) -> paddle.Tensor: - # reduce "sum", "prod", "mean", - # TODO support "amax", "amin" and include_self = False - if reduce in ["sum", "assign", "add"]: - if reduce == "sum": - reduce = "add" - input.put_along_axis_(indices=index, values=src, axis=dim, reduce=reduce) - elif reduce == "mean": - # compute sum first - input.put_along_axis_(indices=index, values=src, axis=dim, reduce="add") - # compute div secondly - input_div = paddle.ones_like(input).put_along_axis( - indices=index, values=paddle.to_tensor(1.0, dtype=input.dtype), axis=dim, reduce="add" - ) - input = input / input_div - elif reduce in ["prod", "mul", "multiply"]: - input = paddle.put_along_axis(input.cpu(), indices=index.cpu(), values=src.cpu(), axis=dim, reduce="mul")._to( - device=paddle.get_device() - ) - else: - raise NotImplementedError("only support mode in ['add', 'sum', 'prod', 'mul', 'multiply', 'mean', 'assign']!") - return input - - -# patch scatter_reduce -paddle.scatter_reduce = scatter_reduce -paddle.Tensor.scatter_reduce = scatter_reduce - - -def do_nothing(x: paddle.Tensor, mode: str = None): - return x - - -def bipartite_soft_matching_random2d( - metric: paddle.Tensor, - w: int, - h: int, - sx: int, - sy: int, - r: int, - no_rand: bool = False, -) -> Tuple[Callable, Callable]: - """ - Partitions the tokens into src and dst and merges r tokens from src to dst. - Dst tokens are partitioned by choosing one randomy in each (sx, sy) region. 
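# --- Illustrative numeric check (not part of the original patch) of the scatter_reduce shim
# defined above, assuming paddle is installed, the function above is in scope, and
# put_along_axis_ accumulates duplicate indices. Shapes and values are made up.
import paddle

base = paddle.zeros([1, 4, 1])
index = paddle.to_tensor([[[0], [0], [2]]], dtype="int64")   # two sources hit slot 0, one hits slot 2
src = paddle.to_tensor([[[1.0], [3.0], [5.0]]])

summed = scatter_reduce(base.clone(), dim=1, index=index, src=src, reduce="sum")
print(summed.flatten().tolist())     # expected [4.0, 0.0, 5.0, 0.0]

averaged = scatter_reduce(base.clone(), dim=1, index=index, src=src, reduce="mean")
print(averaged.flatten().tolist())   # expected roughly [1.33, 0.0, 2.5, 0.0]; the initial zeros count toward the mean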
- - Args: - - metric [B, N, C]: metric to use for similarity - - w: image width in tokens - - h: image height in tokens - - sx: stride in the x dimension for dst, must divide w - - sy: stride in the y dimension for dst, must divide h - - r: number of tokens to remove (by merging) - - no_rand: if true, disable randomness (use top left corner only) - """ - B, N, _ = metric.shape - - if r <= 0: - return do_nothing, do_nothing - - with paddle.no_grad(): - - hsy, wsx = h // sy, w // sx - - if no_rand: - rand_idx = paddle.zeros((hsy, wsx, 1), dtype=paddle.int64) - else: - rand_idx = paddle.randint(sy * sx, shape=(hsy, wsx, 1), dtype=paddle.int64) - - # The image might not divide sx and sy, so we need to work on a view of the top left if the idx buffer instead - idx_buffer_view = paddle.zeros([hsy, wsx, sy * sx], dtype=paddle.int64) - idx_buffer_view.put_along_axis_( - axis=2, indices=rand_idx, values=-paddle.ones_like(rand_idx, dtype=rand_idx.dtype) - ) - idx_buffer_view = ( - idx_buffer_view.reshape([hsy, wsx, sy, sx]).transpose([0, 2, 1, 3]).reshape([hsy * sy, wsx * sx]) - ) - - # Image is not divisible by sx or sy so we need to move it into a new buffer - if (hsy * sy) < h or (wsx * sx) < w: - idx_buffer = paddle.zeros([h, w], dtype=paddle.int64) - idx_buffer[: (hsy * sy), : (wsx * sx)] = idx_buffer_view - else: - idx_buffer = idx_buffer_view - - # We set dst tokens to be -1 and src to be 0, so an argsort gives us dst|src indices - rand_idx = idx_buffer.reshape([1, -1, 1]).argsort(axis=1) - - # We're finished with these - del idx_buffer, idx_buffer_view - - # rand_idx is currently dst|src, so split them - num_dst = hsy * wsx - a_idx = rand_idx[:, num_dst:, :] # src - b_idx = rand_idx[:, :num_dst, :] # dst - - def split(x): - C = x.shape[-1] - - src = x.take_along_axis(indices=a_idx.expand([B, N - num_dst, C]), axis=1) - dst = x.take_along_axis(indices=b_idx.expand([B, num_dst, C]), axis=1) - return src, dst - - # Cosine similarity between A and B - metric = metric / metric.norm(axis=-1, keepdim=True) - a, b = split(metric) - scores = paddle.matmul(a, b, transpose_y=True) - - # Can't reduce more than the # tokens in src - r = min(a.shape[1], r) - - # node_max, node_idx = scores.max(axis=-1) - # top_k vs max argmax - # Find the most similar greedily - node_max, node_idx = paddle.topk(scores, k=1, axis=-1) - # node_max = scores.max(axis=-1) - # node_idx = scores.argmax(axis=-1) - edge_idx = node_max.argsort(axis=-2, descending=True) - - unm_idx = edge_idx[..., r:, :] # Unmerged Tokens - src_idx = edge_idx[..., :r, :] # Merged Tokens - - dst_idx = node_idx.take_along_axis(indices=src_idx, axis=-2) - - def merge(x: paddle.Tensor, mode="mean") -> paddle.Tensor: - src, dst = split(x) - n, t1, c = src.shape - - unm = src.take_along_axis(indices=unm_idx.expand([n, t1 - r, c]), axis=-2) - src = src.take_along_axis(indices=src_idx.expand([n, r, c]), axis=-2) - - dst = scatter_reduce(dst, -2, dst_idx.expand([n, r, c]), src, reduce=mode) - - return paddle.concat([unm, dst], axis=1) - - def unmerge(x: paddle.Tensor) -> paddle.Tensor: - unm_len = unm_idx.shape[1] - unm, dst = x[..., :unm_len, :], x[..., unm_len:, :] - _, _, c = unm.shape - - src = dst.take_along_axis(indices=dst_idx.expand([B, r, c]), axis=-2) - - # Combine back to the original shape - out = paddle.zeros([B, N, c], dtype=x.dtype) - - out.put_along_axis_( - indices=b_idx.expand([B, num_dst, c]), - values=dst, - axis=-2, - ) - out.put_along_axis_( - indices=a_idx.expand([B, a_idx.shape[1], 1]) - .take_along_axis(indices=unm_idx, axis=1) - 
.expand([B, unm_len, c]), - values=unm, - axis=-2, - ) - out.put_along_axis_( - indices=a_idx.expand([B, a_idx.shape[1], 1]).take_along_axis(indices=src_idx, axis=1).expand([B, r, c]), - values=src, - axis=-2, - ) - - return out - - return merge, unmerge - - -def compute_merge(x: paddle.Tensor, tome_info: Dict[str, Any]) -> Tuple[Callable, ...]: - original_h, original_w = tome_info["size"] - original_tokens = original_h * original_w - downsample = int(math.ceil(math.sqrt(original_tokens // x.shape[1]))) - - args = tome_info["args"] - - if downsample <= args["max_downsample"]: - w = int(math.ceil(original_w / downsample)) - h = int(math.ceil(original_h / downsample)) - r = int(x.shape[1] * args["ratio"]) - # If the batch size is odd, then it's not possible for promted and unprompted images to be in the same - # batch, which causes artifacts with use_rand, so force it to be off. - use_rand = False if x.shape[0] % 2 == 1 else args["use_rand"] - m, u = bipartite_soft_matching_random2d(x, w, h, args["sx"], args["sy"], r, not use_rand) - else: - m, u = (do_nothing, do_nothing) - - m_a, u_a = (m, u) if args["merge_attn"] else (do_nothing, do_nothing) - m_c, u_c = (m, u) if args["merge_crossattn"] else (do_nothing, do_nothing) - m_m, u_m = (m, u) if args["merge_mlp"] else (do_nothing, do_nothing) - - return m_a, m_c, m_m, u_a, u_c, u_m # Okay this is probably not very good - - -def make_tome_block(block_class: Type[nn.Layer]) -> Type[nn.Layer]: - """ - Make a patched class on the fly so we don't have to import any specific modules. - This patch applies ToMe to the forward function of the block. - """ - - class ToMeBasicTransformerBlock(block_class): - # Save for unpatching later - _parent = block_class - - def forward( - self: BasicTransformerBlock, - hidden_states, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - timestep=None, - cross_attention_kwargs=None, - class_labels=None, - ) -> paddle.Tensor: - # (1) ToMe - m_a, m_c, m_m, u_a, u_c, u_m = compute_merge(hidden_states, self._tome_info) - - if self.use_ada_layer_norm: - norm_hidden_states = self.norm1(hidden_states, timestep) - elif self.use_ada_layer_norm_zero: - norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( - hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype - ) - else: - norm_hidden_states = self.norm1(hidden_states) - - # (2) ToMe m_a - norm_hidden_states = m_a(norm_hidden_states) - - # 1. Self-Attention - cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} - attn_output = self.attn1( - norm_hidden_states, - encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, - attention_mask=attention_mask, - **cross_attention_kwargs, - ) - if self.use_ada_layer_norm_zero: - attn_output = gate_msa.unsqueeze(1) * attn_output - - # (3) ToMe u_a - hidden_states = u_a(attn_output) + hidden_states - - if self.attn2 is not None: - norm_hidden_states = ( - self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) - ) - # (4) ToMe m_c - norm_hidden_states = m_c(norm_hidden_states) - - # 2. Cross-Attention - attn_output = self.attn2( - norm_hidden_states, - encoder_hidden_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - **cross_attention_kwargs, - ) - # (5) ToMe u_c - hidden_states = u_c(attn_output) + hidden_states - - # 3. 
Feed-forward - norm_hidden_states = self.norm3(hidden_states) - - if self.use_ada_layer_norm_zero: - norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] - - # (6) ToMe m_m - norm_hidden_states = m_m(norm_hidden_states) - - ff_output = self.ff(norm_hidden_states) - - if self.use_ada_layer_norm_zero: - ff_output = gate_mlp.unsqueeze(1) * ff_output - - # (7) ToMe u_m - hidden_states = u_m(ff_output) + hidden_states - - return hidden_states - - return ToMeBasicTransformerBlock - - -def hook_tome_model(model: nn.Layer): - """Adds a forward pre hook to get the image size. This hook can be removed with remove_patch.""" - - def hook(module, args): - module._tome_info["size"] = (args[0].shape[2], args[0].shape[3]) - return None - - model._tome_info["hooks"].append(model.register_forward_pre_hook(hook)) - - -@patch_to([DiffusionPipeline, nn.Layer]) -def remove_tome(model_or_pipe: Union[nn.Layer, DiffusionPipeline], only_return_self: bool = True): - """Removes a patch from a ToMeXXX module if it was already patched.""" - model_list = [] - if isinstance(model_or_pipe, DiffusionPipeline): - for _, component in model_or_pipe.components.items(): - if isinstance(component, nn.Layer): - model_list.append(component) - elif isinstance(component, (tuple, list)): - for each_component in component: - if isinstance(component, nn.Layer): - model_list.append(each_component) - elif isinstance(model_or_pipe, nn.Layer): - model_list.append(model_or_pipe) - - for model in model_list: - for _, module in model.named_sublayers(include_self=True): - if hasattr(module, "_tome_info"): - for hook in module._tome_info["hooks"]: - hook.remove() - module._tome_info["hooks"].clear() - - if module.__class__.__name__.startswith(TOME_PREFIX): - module.__class__ = module._parent - - if only_return_self: - return model_or_pipe - return model_or_pipe, model_list - - -@patch_to([DiffusionPipeline, nn.Layer]) -def apply_tome( - model_or_pipe: Union[nn.Layer, DiffusionPipeline], - ratio: float = 0.5, - max_downsample: int = 1, - sx: int = 2, - sy: int = 2, - use_rand: bool = True, - merge_attn: bool = True, - merge_crossattn: bool = False, - merge_mlp: bool = False, -): - """ - Patches a stable diffusion model_or_pipe with ToMe. - Apply this to the highest level stable diffusion object (i.e., it should have a .unet). - - Important Args: - - model_or_pipe: A top level Stable Diffusion module or pipeline to patch in place. - - ratio: The ratio of tokens to merge. I.e., 0.4 would reduce the total number of tokens by 40%. - The maximum value for this is 1-(1/(sx*sy)). By default, the max is 0.75 (I recommend <= 0.5 though). - Higher values result in more speed-up, but with more visual quality loss. - - Args to tinker with if you want: - - max_downsample [1, 2, 4, or 8]: Apply ToMe to layers with at most this amount of downsampling. - E.g., 1 only applies to layers with no downsampling (4/15) while - 8 applies to all layers (15/15). I recommend a value of 1 or 2. - - sx, sy: The stride for computing dst sets (see paper). A higher stride means you can merge more tokens, - but the default of (2, 2) works well in most cases. Must divide the image size. - - use_rand: Whether or not to allow random perturbations when computing dst sets (see paper). Usually - you'd want to leave this on, but if you're having weird artifacts try turning this off. - - merge_attn: Whether or not to merge tokens for attention (recommended). - - merge_crossattn: Whether or not to merge tokens for cross attention (not recommended). 
- - merge_mlp: Whether or not to merge tokens for the mlp layers (very not recommended). - - """ - if ratio >= 1 - (1 / (sx * sy)): - raise ValueError(f"The tome ratio must be less than {1-(1/(sx*sy))} !") - - # Make sure the model_or_pipe is not currently patched - model_list = model_or_pipe.remove_tome(only_return_self=False)[1] - - for model in model_list: - need_patch = False - model._tome_info = { - "size": None, - "hooks": [], - "args": { - "ratio": ratio, - "max_downsample": max_downsample, - "sx": sx, - "sy": sy, - "use_rand": use_rand, - "merge_attn": merge_attn, - "merge_crossattn": merge_crossattn, - "merge_mlp": merge_mlp, - }, - } - for _, module in model.named_sublayers(include_self=True): - # If for some reason this has a different name, create an issue and I'll fix it - if isinstance(module, BasicTransformerBlock): - module.__class__ = make_tome_block(module.__class__) - module._tome_info = model._tome_info - need_patch = True - - if need_patch: - hook_tome_model(model) - - return model_or_pipe diff --git a/ppdiffusers/ppdiffusers/patches/webui_lora_patch_utils.py b/ppdiffusers/ppdiffusers/patches/webui_lora_patch_utils.py deleted file mode 100644 index af5f62435c62..000000000000 --- a/ppdiffusers/ppdiffusers/patches/webui_lora_patch_utils.py +++ /dev/null @@ -1,2622 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
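For reference, the ToMe helpers deleted in the hunk above are monkey-patched onto DiffusionPipeline and nn.Layer via patch_to, so they were invoked as pipeline methods. A minimal usage sketch follows; the checkpoint id and prompt are illustrative placeholders and not part of this patch.

from ppdiffusers import StableDiffusionPipeline

# Load any Stable Diffusion checkpoint (placeholder id).
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

# Patch every BasicTransformerBlock: merge ~50% of self-attention tokens,
# only in blocks with no downsampling (max_downsample=1). ratio must stay
# below 1 - 1/(sx*sy), i.e. 0.75 with the default (2, 2) strides.
pipe.apply_tome(ratio=0.5, max_downsample=1, merge_attn=True)
image = pipe("a photo of an astronaut riding a horse").images[0]

# Unpatch: restores the original block classes and removes the size hook.
pipe.remove_tome()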
- -import math -import os -from types import MethodType - -import paddle -import paddle.nn as nn - -from ..models.modeling_utils import convert_state_dict -from ..pipelines import DiffusionPipeline -from ..utils import is_safetensors_available, is_torch_available -from ..utils.constants import PPDIFFUSERS_CACHE -from ..utils.download_utils import ppdiffusers_url_download -from ..utils.initializer_utils import kaiming_uniform_, zeros_ -from ..utils.load_utils import smart_load -from .ppnlp_patch_utils import patch_to - -if is_safetensors_available(): - import safetensors - - -def convert_pt_to_pd(state, dtype): - if dtype is None: - dtype = "float32" - new_state = {} - for a, b in safetensors_weight_mapping: - if a in state: - val = state[a] - if val.ndim == 2: - val = val.T - if val.ndim == 0: - val = val.reshape((1,)) - new_state[b] = val.cast(dtype) - else: - print(f"We find {a} not in state_dict and we will continue!") - return new_state - - -def convert_pd_to_pt(state): - new_state = {} - for a, b in safetensors_weight_mapping: - if b in state: - val = state[b] - if val.ndim == 2: - val = val.T - # if ".alpha" in a: - # val = val.squeeze() - - new_state[a] = val - return new_state - - -def extract_lora_weights(model): - sd = {} - for k, v in model.state_dict().items(): - if "lora" in k or ".alpha" in k: - sd[k] = v - return sd - - -@patch_to([DiffusionPipeline, nn.Layer]) -def save_lora(pipe_or_module, save_directory, WEIGHT_NAME=None): - if WEIGHT_NAME is None: - WEIGHT_NAME = "text_encoder_unet_lora.safetensors" - outdict = {} - if isinstance(pipe_or_module, nn.Layer): - outdict.update(extract_lora_weights(pipe_or_module)) - else: - if hasattr(pipe_or_module, "text_encoder"): - outdict.update(extract_lora_weights(pipe_or_module.text_encoder)) - if hasattr(pipe_or_module, "unet"): - outdict.update(extract_lora_weights(pipe_or_module.unet)) - os.makedirs(save_directory, exist_ok=True) - - if is_torch_available(): - save_function = safetensors.torch.save_file - outdict = convert_state_dict(convert_pd_to_pt(outdict), framework="torch") - else: - save_function = safetensors.numpy.save_file - outdict = convert_state_dict(convert_pd_to_pt(outdict), framework="numpy") - - save_function(outdict, os.path.join(save_directory, WEIGHT_NAME)) - del outdict - print(f"Model weights saved in {os.path.join(save_directory, WEIGHT_NAME)}") - - -@patch_to([DiffusionPipeline, nn.Layer]) -def set_lora_enabled(pipe_or_module, enable=True): - def set_lora(self): - if hasattr(self, "enable_lora"): - self.enable_lora = enable - - if isinstance(pipe_or_module, nn.Layer): - pipe_or_module.apply(set_lora) - else: - if hasattr(pipe_or_module, "text_encoder"): - pipe_or_module.text_encoder.apply(set_lora) - if hasattr(pipe_or_module, "unet"): - pipe_or_module.unet.apply(set_lora) - - -@patch_to([DiffusionPipeline, nn.Layer]) -def apply_lora( - pipe_or_module, - lora_weight_or_path=None, - rank=4, - alpha=None, - multiplier=1.0, - text_encoder_target_replace_modules=["TransformerEncoderLayer"], - unet_target_replace_modules=["Transformer2DModel", "Attention"], - enable_lora=True, - **kwargs, -): - resume_download = kwargs.pop("resume_download", False) - force_download = kwargs.pop("force_download", False) - paddle_dtype = kwargs.pop("paddle_dtype", None) - cache_dir = kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) - - if paddle_dtype is None: - if isinstance(pipe_or_module, nn.Layer): - paddle_dtype = pipe_or_module.dtype - else: - if hasattr(pipe_or_module, "text_encoder"): - paddle_dtype = 
pipe_or_module.text_encoder.dtype - if hasattr(pipe_or_module, "unet"): - paddle_dtype = pipe_or_module.unet.dtype - - if lora_weight_or_path is not None: - lora_weight_or_path = str(lora_weight_or_path) - if os.path.isfile(lora_weight_or_path): - lora_weight_or_path = lora_weight_or_path - elif lora_weight_or_path.startswith("http://") or lora_weight_or_path.startswith("https://"): - lora_weight_or_path = ppdiffusers_url_download( - lora_weight_or_path, - cache_dir=cache_dir, - resume_download=resume_download, - force_download=force_download, - ) - else: - raise EnvironmentError(f"Please check your {lora_weight_or_path}.") - lora_weight_or_path = convert_pt_to_pd(smart_load(lora_weight_or_path), paddle_dtype) - - mayberanklist = [] - maybealphalist = [] - for k, v in lora_weight_or_path.items(): - if "lora_down" in k and "alpha" not in k: - if v.ndim == 2: - mayberanklist.append(v.shape[1]) - elif v.ndim == 4: - mayberanklist.append(v.shape[0]) - - if "lora_up" in k and "alpha" not in k: - if v.ndim == 2: - mayberanklist.append(v.shape[0]) - elif v.ndim == 4: - mayberanklist.append(v.shape[1]) - - if "alpha" in k: - # we must cast it to float32 - maybealphalist.append(v.astype("float32").item()) - if len(mayberanklist) > 20: - break - if len(set(mayberanklist)) > 1: - print(f"Can't guess rank! Here are the rank list {mayberanklist}. We will use default rank {rank}.") - else: - rank = mayberanklist[0] - print(f"|---------------Currently, rank is {rank}!") - - if len(set(maybealphalist)) > 1: - print(f"Can't guess alpha! Here are the rank list {maybealphalist}. We will use default alpha {alpha}") - else: - alpha = maybealphalist[0] - print(f"|---------------Currently, alpha is {alpha}!") - - waitlist = [] - if isinstance(pipe_or_module, nn.Layer): - waitlist.append((pipe_or_module, text_encoder_target_replace_modules + unet_target_replace_modules)) - else: - if hasattr(pipe_or_module, "text_encoder"): - waitlist.append((pipe_or_module.text_encoder, text_encoder_target_replace_modules)) - if hasattr(pipe_or_module, "unet"): - waitlist.append((pipe_or_module.unet, unet_target_replace_modules)) - lora_modules = {} - for each_module, target_replace_modules in waitlist: - for name1, module in each_module.named_sublayers(include_self=True): - if module.__class__.__name__ in target_replace_modules: - for name2, child_module in module.named_sublayers(include_self=True): - if not getattr(child_module, "is_lora_linear", False) and ( - child_module.__class__.__name__ == "Linear" - or (child_module.__class__.__name__ == "Conv2D" and list(child_module._kernel_size) == [1, 1]) - ): - # if we apply lora multi - if hasattr(child_module, "merged") and child_module.merged: - with paddle.no_grad(): - if child_module.is_conv: - new_weight = ( - child_module.weight.squeeze([-1, -2]) - - child_module.lora_up.weight.squeeze([-1, -2]) - @ child_module.lora_down.weight.squeeze([-1, -2]) - * child_module.multiplier - * child_module.scale - ).unsqueeze([-1, -2]) - else: - new_weight = ( - child_module.weight - - child_module.lora_down.weight - @ child_module.lora_up.weight - * child_module.multiplier - * child_module.scale - ) - child_module.weight.set_value(new_weight) - - in_features, out_features = child_module.weight.shape[0], child_module.weight.shape[1] - child_module.is_conv = False - child_module.merged = False - - if child_module.weight.ndim == 4: - child_module.is_conv = True - in_features, out_features = out_features, in_features - - if rank > min(in_features, out_features): - raise ValueError( - f"LoRA 
rank {rank} must be less or equal than {min(in_features, out_features)}" - ) - - if child_module.is_conv: - child_module.lora_down = nn.Conv2D(in_features, rank, [1, 1], bias_attr=False) - child_module.lora_up = nn.Conv2D(rank, out_features, [1, 1], bias_attr=False) - else: - child_module.lora_down = nn.Linear(in_features, rank, bias_attr=False) - child_module.lora_up = nn.Linear(rank, out_features, bias_attr=False) - child_module.lora_down.is_lora_linear = True - child_module.lora_up.is_lora_linear = True - child_module.rank = rank - child_module.enable_lora = enable_lora - - if paddle.is_tensor(alpha): - alpha = alpha.detach().cast("float32").numpy() - alpha = rank if alpha is None or alpha == 0 else alpha - child_module.scale = alpha / child_module.rank - child_module.register_buffer("alpha", paddle.to_tensor(alpha, dtype="float32")) - - # same as microsoft's - kaiming_uniform_(child_module.lora_down.weight, a=math.sqrt(5)) - zeros_(child_module.lora_up.weight) - child_module.multiplier = multiplier - - if getattr(child_module, "raw_forward", None) is None: - child_module.raw_forward = child_module.forward - - def forward_lora(self, x): - if self.training: - if self.merged: - with paddle.no_grad(): - if self.is_conv: - new_weight = ( - self.weight.squeeze([-1, -2]) - - self.lora_up.weight.squeeze([-1, -2]) - @ self.lora_down.weight.squeeze([-1, -2]) - * self.multiplier - * self.scale - ).unsqueeze([-1, -2]) - else: - new_weight = ( - self.weight - - self.lora_down.weight - @ self.lora_up.weight - * self.multiplier - * self.scale - ) - self.weight.set_value(new_weight) - self.merged = False - if not self.enable_lora: - return self.raw_forward(x) - return ( - self.raw_forward(x) - + self.lora_up(self.lora_down(x)) * self.multiplier * self.scale - ) - else: - if self.enable_lora and not self.merged: - with paddle.no_grad(): - if self.is_conv: - new_weight = ( - self.weight.squeeze([-1, -2]) - + self.lora_up.weight.squeeze([-1, -2]) - @ self.lora_down.weight.squeeze([-1, -2]) - * self.multiplier - * self.scale - ).unsqueeze([-1, -2]) - else: - new_weight = ( - self.weight - + self.lora_down.weight - @ self.lora_up.weight - * self.multiplier - * self.scale - ) - self.weight.set_value(new_weight) - self.merged = True - - if not self.enable_lora and self.merged: - with paddle.no_grad(): - if self.is_conv: - new_weight = ( - self.weight.squeeze([-1, -2]) - - self.lora_up.weight.squeeze([-1, -2]) - @ self.lora_down.weight.squeeze([-1, -2]) - * self.multiplier - * self.scale - ).unsqueeze([-1, -2]) - else: - new_weight = ( - self.weight - - self.lora_down.weight - @ self.lora_up.weight - * self.multiplier - * self.scale - ) - self.weight.set_value(new_weight) - self.merged = False - return self.raw_forward(x) - - child_module.forward = MethodType(forward_lora, child_module) - child_module.lora_down.training = child_module.training - child_module.lora_up.training = child_module.training - child_module.to(dtype=paddle_dtype) - # we will return lora_modules - lora_modules[name1 + "." 
+ name2] = child_module - - if lora_weight_or_path is not None: - if isinstance(pipe_or_module, nn.Layer): - pipe_or_module.set_dict(lora_weight_or_path) - else: - if hasattr(pipe_or_module, "text_encoder"): - pipe_or_module.text_encoder.set_dict(lora_weight_or_path) - pipe_or_module.text_encoder.eval() - if hasattr(pipe_or_module, "unet"): - pipe_or_module.unet.set_dict(lora_weight_or_path) - pipe_or_module.unet.eval() - - del lora_weight_or_path - print("Loading lora_weights successfully!") - return lora_modules - - -safetensors_weight_mapping = [ - [ - "lora_te_text_model_encoder_layers_0_self_attn_q_proj.lora_down.weight", - "text_model.transformer.layers.0.self_attn.q_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight", - "text_model.transformer.layers.0.self_attn.k_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_0_self_attn_v_proj.lora_down.weight", - "text_model.transformer.layers.0.self_attn.v_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_0_self_attn_out_proj.lora_down.weight", - "text_model.transformer.layers.0.self_attn.out_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight", - "text_model.transformer.layers.0.linear1.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_0_mlp_fc2.lora_down.weight", - "text_model.transformer.layers.0.linear2.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_0_self_attn_q_proj.lora_up.weight", - "text_model.transformer.layers.0.self_attn.q_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_up.weight", - "text_model.transformer.layers.0.self_attn.k_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_0_self_attn_v_proj.lora_up.weight", - "text_model.transformer.layers.0.self_attn.v_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_0_self_attn_out_proj.lora_up.weight", - "text_model.transformer.layers.0.self_attn.out_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_0_mlp_fc1.lora_up.weight", - "text_model.transformer.layers.0.linear1.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_0_mlp_fc2.lora_up.weight", - "text_model.transformer.layers.0.linear2.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_0_self_attn_q_proj.alpha", - "text_model.transformer.layers.0.self_attn.q_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_0_self_attn_k_proj.alpha", - "text_model.transformer.layers.0.self_attn.k_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_0_self_attn_v_proj.alpha", - "text_model.transformer.layers.0.self_attn.v_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_0_self_attn_out_proj.alpha", - "text_model.transformer.layers.0.self_attn.out_proj.alpha", - ], - ["lora_te_text_model_encoder_layers_0_mlp_fc1.alpha", "text_model.transformer.layers.0.linear1.alpha"], - ["lora_te_text_model_encoder_layers_0_mlp_fc2.alpha", "text_model.transformer.layers.0.linear2.alpha"], - [ - "lora_te_text_model_encoder_layers_1_self_attn_q_proj.lora_down.weight", - "text_model.transformer.layers.1.self_attn.q_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_1_self_attn_k_proj.lora_down.weight", - "text_model.transformer.layers.1.self_attn.k_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_1_self_attn_v_proj.lora_down.weight", - "text_model.transformer.layers.1.self_attn.v_proj.lora_down.weight", - ], - [ - 
"lora_te_text_model_encoder_layers_1_self_attn_out_proj.lora_down.weight", - "text_model.transformer.layers.1.self_attn.out_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_1_mlp_fc1.lora_down.weight", - "text_model.transformer.layers.1.linear1.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_1_mlp_fc2.lora_down.weight", - "text_model.transformer.layers.1.linear2.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_1_self_attn_q_proj.lora_up.weight", - "text_model.transformer.layers.1.self_attn.q_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_1_self_attn_k_proj.lora_up.weight", - "text_model.transformer.layers.1.self_attn.k_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_1_self_attn_v_proj.lora_up.weight", - "text_model.transformer.layers.1.self_attn.v_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_1_self_attn_out_proj.lora_up.weight", - "text_model.transformer.layers.1.self_attn.out_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_1_mlp_fc1.lora_up.weight", - "text_model.transformer.layers.1.linear1.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_1_mlp_fc2.lora_up.weight", - "text_model.transformer.layers.1.linear2.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_1_self_attn_q_proj.alpha", - "text_model.transformer.layers.1.self_attn.q_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_1_self_attn_k_proj.alpha", - "text_model.transformer.layers.1.self_attn.k_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_1_self_attn_v_proj.alpha", - "text_model.transformer.layers.1.self_attn.v_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_1_self_attn_out_proj.alpha", - "text_model.transformer.layers.1.self_attn.out_proj.alpha", - ], - ["lora_te_text_model_encoder_layers_1_mlp_fc1.alpha", "text_model.transformer.layers.1.linear1.alpha"], - ["lora_te_text_model_encoder_layers_1_mlp_fc2.alpha", "text_model.transformer.layers.1.linear2.alpha"], - [ - "lora_te_text_model_encoder_layers_2_self_attn_q_proj.lora_down.weight", - "text_model.transformer.layers.2.self_attn.q_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_2_self_attn_k_proj.lora_down.weight", - "text_model.transformer.layers.2.self_attn.k_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_2_self_attn_v_proj.lora_down.weight", - "text_model.transformer.layers.2.self_attn.v_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_2_self_attn_out_proj.lora_down.weight", - "text_model.transformer.layers.2.self_attn.out_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_2_mlp_fc1.lora_down.weight", - "text_model.transformer.layers.2.linear1.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_2_mlp_fc2.lora_down.weight", - "text_model.transformer.layers.2.linear2.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_2_self_attn_q_proj.lora_up.weight", - "text_model.transformer.layers.2.self_attn.q_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_2_self_attn_k_proj.lora_up.weight", - "text_model.transformer.layers.2.self_attn.k_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_2_self_attn_v_proj.lora_up.weight", - "text_model.transformer.layers.2.self_attn.v_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_2_self_attn_out_proj.lora_up.weight", - "text_model.transformer.layers.2.self_attn.out_proj.lora_up.weight", - ], 
- [ - "lora_te_text_model_encoder_layers_2_mlp_fc1.lora_up.weight", - "text_model.transformer.layers.2.linear1.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_2_mlp_fc2.lora_up.weight", - "text_model.transformer.layers.2.linear2.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_2_self_attn_q_proj.alpha", - "text_model.transformer.layers.2.self_attn.q_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_2_self_attn_k_proj.alpha", - "text_model.transformer.layers.2.self_attn.k_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_2_self_attn_v_proj.alpha", - "text_model.transformer.layers.2.self_attn.v_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_2_self_attn_out_proj.alpha", - "text_model.transformer.layers.2.self_attn.out_proj.alpha", - ], - ["lora_te_text_model_encoder_layers_2_mlp_fc1.alpha", "text_model.transformer.layers.2.linear1.alpha"], - ["lora_te_text_model_encoder_layers_2_mlp_fc2.alpha", "text_model.transformer.layers.2.linear2.alpha"], - [ - "lora_te_text_model_encoder_layers_3_self_attn_q_proj.lora_down.weight", - "text_model.transformer.layers.3.self_attn.q_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_3_self_attn_k_proj.lora_down.weight", - "text_model.transformer.layers.3.self_attn.k_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_3_self_attn_v_proj.lora_down.weight", - "text_model.transformer.layers.3.self_attn.v_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_3_self_attn_out_proj.lora_down.weight", - "text_model.transformer.layers.3.self_attn.out_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_3_mlp_fc1.lora_down.weight", - "text_model.transformer.layers.3.linear1.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_3_mlp_fc2.lora_down.weight", - "text_model.transformer.layers.3.linear2.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_3_self_attn_q_proj.lora_up.weight", - "text_model.transformer.layers.3.self_attn.q_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_3_self_attn_k_proj.lora_up.weight", - "text_model.transformer.layers.3.self_attn.k_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_3_self_attn_v_proj.lora_up.weight", - "text_model.transformer.layers.3.self_attn.v_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_3_self_attn_out_proj.lora_up.weight", - "text_model.transformer.layers.3.self_attn.out_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_3_mlp_fc1.lora_up.weight", - "text_model.transformer.layers.3.linear1.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_3_mlp_fc2.lora_up.weight", - "text_model.transformer.layers.3.linear2.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_3_self_attn_q_proj.alpha", - "text_model.transformer.layers.3.self_attn.q_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_3_self_attn_k_proj.alpha", - "text_model.transformer.layers.3.self_attn.k_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_3_self_attn_v_proj.alpha", - "text_model.transformer.layers.3.self_attn.v_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_3_self_attn_out_proj.alpha", - "text_model.transformer.layers.3.self_attn.out_proj.alpha", - ], - ["lora_te_text_model_encoder_layers_3_mlp_fc1.alpha", "text_model.transformer.layers.3.linear1.alpha"], - ["lora_te_text_model_encoder_layers_3_mlp_fc2.alpha", "text_model.transformer.layers.3.linear2.alpha"], - [ - 
"lora_te_text_model_encoder_layers_4_self_attn_q_proj.lora_down.weight", - "text_model.transformer.layers.4.self_attn.q_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_4_self_attn_k_proj.lora_down.weight", - "text_model.transformer.layers.4.self_attn.k_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_4_self_attn_v_proj.lora_down.weight", - "text_model.transformer.layers.4.self_attn.v_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_4_self_attn_out_proj.lora_down.weight", - "text_model.transformer.layers.4.self_attn.out_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_4_mlp_fc1.lora_down.weight", - "text_model.transformer.layers.4.linear1.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_4_mlp_fc2.lora_down.weight", - "text_model.transformer.layers.4.linear2.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_4_self_attn_q_proj.lora_up.weight", - "text_model.transformer.layers.4.self_attn.q_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_4_self_attn_k_proj.lora_up.weight", - "text_model.transformer.layers.4.self_attn.k_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_4_self_attn_v_proj.lora_up.weight", - "text_model.transformer.layers.4.self_attn.v_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_4_self_attn_out_proj.lora_up.weight", - "text_model.transformer.layers.4.self_attn.out_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_4_mlp_fc1.lora_up.weight", - "text_model.transformer.layers.4.linear1.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_4_mlp_fc2.lora_up.weight", - "text_model.transformer.layers.4.linear2.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_4_self_attn_q_proj.alpha", - "text_model.transformer.layers.4.self_attn.q_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_4_self_attn_k_proj.alpha", - "text_model.transformer.layers.4.self_attn.k_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_4_self_attn_v_proj.alpha", - "text_model.transformer.layers.4.self_attn.v_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_4_self_attn_out_proj.alpha", - "text_model.transformer.layers.4.self_attn.out_proj.alpha", - ], - ["lora_te_text_model_encoder_layers_4_mlp_fc1.alpha", "text_model.transformer.layers.4.linear1.alpha"], - ["lora_te_text_model_encoder_layers_4_mlp_fc2.alpha", "text_model.transformer.layers.4.linear2.alpha"], - [ - "lora_te_text_model_encoder_layers_5_self_attn_q_proj.lora_down.weight", - "text_model.transformer.layers.5.self_attn.q_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_5_self_attn_k_proj.lora_down.weight", - "text_model.transformer.layers.5.self_attn.k_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_5_self_attn_v_proj.lora_down.weight", - "text_model.transformer.layers.5.self_attn.v_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_5_self_attn_out_proj.lora_down.weight", - "text_model.transformer.layers.5.self_attn.out_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_5_mlp_fc1.lora_down.weight", - "text_model.transformer.layers.5.linear1.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_5_mlp_fc2.lora_down.weight", - "text_model.transformer.layers.5.linear2.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_5_self_attn_q_proj.lora_up.weight", - 
"text_model.transformer.layers.5.self_attn.q_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_5_self_attn_k_proj.lora_up.weight", - "text_model.transformer.layers.5.self_attn.k_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_5_self_attn_v_proj.lora_up.weight", - "text_model.transformer.layers.5.self_attn.v_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_5_self_attn_out_proj.lora_up.weight", - "text_model.transformer.layers.5.self_attn.out_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_5_mlp_fc1.lora_up.weight", - "text_model.transformer.layers.5.linear1.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_5_mlp_fc2.lora_up.weight", - "text_model.transformer.layers.5.linear2.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_5_self_attn_q_proj.alpha", - "text_model.transformer.layers.5.self_attn.q_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_5_self_attn_k_proj.alpha", - "text_model.transformer.layers.5.self_attn.k_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_5_self_attn_v_proj.alpha", - "text_model.transformer.layers.5.self_attn.v_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_5_self_attn_out_proj.alpha", - "text_model.transformer.layers.5.self_attn.out_proj.alpha", - ], - ["lora_te_text_model_encoder_layers_5_mlp_fc1.alpha", "text_model.transformer.layers.5.linear1.alpha"], - ["lora_te_text_model_encoder_layers_5_mlp_fc2.alpha", "text_model.transformer.layers.5.linear2.alpha"], - [ - "lora_te_text_model_encoder_layers_6_self_attn_q_proj.lora_down.weight", - "text_model.transformer.layers.6.self_attn.q_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_6_self_attn_k_proj.lora_down.weight", - "text_model.transformer.layers.6.self_attn.k_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_6_self_attn_v_proj.lora_down.weight", - "text_model.transformer.layers.6.self_attn.v_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_6_self_attn_out_proj.lora_down.weight", - "text_model.transformer.layers.6.self_attn.out_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_6_mlp_fc1.lora_down.weight", - "text_model.transformer.layers.6.linear1.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_6_mlp_fc2.lora_down.weight", - "text_model.transformer.layers.6.linear2.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_6_self_attn_q_proj.lora_up.weight", - "text_model.transformer.layers.6.self_attn.q_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_6_self_attn_k_proj.lora_up.weight", - "text_model.transformer.layers.6.self_attn.k_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_6_self_attn_v_proj.lora_up.weight", - "text_model.transformer.layers.6.self_attn.v_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_6_self_attn_out_proj.lora_up.weight", - "text_model.transformer.layers.6.self_attn.out_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_6_mlp_fc1.lora_up.weight", - "text_model.transformer.layers.6.linear1.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_6_mlp_fc2.lora_up.weight", - "text_model.transformer.layers.6.linear2.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_6_self_attn_q_proj.alpha", - "text_model.transformer.layers.6.self_attn.q_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_6_self_attn_k_proj.alpha", - 
"text_model.transformer.layers.6.self_attn.k_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_6_self_attn_v_proj.alpha", - "text_model.transformer.layers.6.self_attn.v_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_6_self_attn_out_proj.alpha", - "text_model.transformer.layers.6.self_attn.out_proj.alpha", - ], - ["lora_te_text_model_encoder_layers_6_mlp_fc1.alpha", "text_model.transformer.layers.6.linear1.alpha"], - ["lora_te_text_model_encoder_layers_6_mlp_fc2.alpha", "text_model.transformer.layers.6.linear2.alpha"], - [ - "lora_te_text_model_encoder_layers_7_self_attn_q_proj.lora_down.weight", - "text_model.transformer.layers.7.self_attn.q_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_7_self_attn_k_proj.lora_down.weight", - "text_model.transformer.layers.7.self_attn.k_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_7_self_attn_v_proj.lora_down.weight", - "text_model.transformer.layers.7.self_attn.v_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_7_self_attn_out_proj.lora_down.weight", - "text_model.transformer.layers.7.self_attn.out_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_7_mlp_fc1.lora_down.weight", - "text_model.transformer.layers.7.linear1.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_7_mlp_fc2.lora_down.weight", - "text_model.transformer.layers.7.linear2.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_7_self_attn_q_proj.lora_up.weight", - "text_model.transformer.layers.7.self_attn.q_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_7_self_attn_k_proj.lora_up.weight", - "text_model.transformer.layers.7.self_attn.k_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_7_self_attn_v_proj.lora_up.weight", - "text_model.transformer.layers.7.self_attn.v_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_7_self_attn_out_proj.lora_up.weight", - "text_model.transformer.layers.7.self_attn.out_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_7_mlp_fc1.lora_up.weight", - "text_model.transformer.layers.7.linear1.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_7_mlp_fc2.lora_up.weight", - "text_model.transformer.layers.7.linear2.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_7_self_attn_q_proj.alpha", - "text_model.transformer.layers.7.self_attn.q_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_7_self_attn_k_proj.alpha", - "text_model.transformer.layers.7.self_attn.k_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_7_self_attn_v_proj.alpha", - "text_model.transformer.layers.7.self_attn.v_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_7_self_attn_out_proj.alpha", - "text_model.transformer.layers.7.self_attn.out_proj.alpha", - ], - ["lora_te_text_model_encoder_layers_7_mlp_fc1.alpha", "text_model.transformer.layers.7.linear1.alpha"], - ["lora_te_text_model_encoder_layers_7_mlp_fc2.alpha", "text_model.transformer.layers.7.linear2.alpha"], - [ - "lora_te_text_model_encoder_layers_8_self_attn_q_proj.lora_down.weight", - "text_model.transformer.layers.8.self_attn.q_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_8_self_attn_k_proj.lora_down.weight", - "text_model.transformer.layers.8.self_attn.k_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_8_self_attn_v_proj.lora_down.weight", - "text_model.transformer.layers.8.self_attn.v_proj.lora_down.weight", - ], - [ - 
"lora_te_text_model_encoder_layers_8_self_attn_out_proj.lora_down.weight", - "text_model.transformer.layers.8.self_attn.out_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_8_mlp_fc1.lora_down.weight", - "text_model.transformer.layers.8.linear1.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_8_mlp_fc2.lora_down.weight", - "text_model.transformer.layers.8.linear2.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_8_self_attn_q_proj.lora_up.weight", - "text_model.transformer.layers.8.self_attn.q_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_8_self_attn_k_proj.lora_up.weight", - "text_model.transformer.layers.8.self_attn.k_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_8_self_attn_v_proj.lora_up.weight", - "text_model.transformer.layers.8.self_attn.v_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_8_self_attn_out_proj.lora_up.weight", - "text_model.transformer.layers.8.self_attn.out_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_8_mlp_fc1.lora_up.weight", - "text_model.transformer.layers.8.linear1.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_8_mlp_fc2.lora_up.weight", - "text_model.transformer.layers.8.linear2.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_8_self_attn_q_proj.alpha", - "text_model.transformer.layers.8.self_attn.q_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_8_self_attn_k_proj.alpha", - "text_model.transformer.layers.8.self_attn.k_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_8_self_attn_v_proj.alpha", - "text_model.transformer.layers.8.self_attn.v_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_8_self_attn_out_proj.alpha", - "text_model.transformer.layers.8.self_attn.out_proj.alpha", - ], - ["lora_te_text_model_encoder_layers_8_mlp_fc1.alpha", "text_model.transformer.layers.8.linear1.alpha"], - ["lora_te_text_model_encoder_layers_8_mlp_fc2.alpha", "text_model.transformer.layers.8.linear2.alpha"], - [ - "lora_te_text_model_encoder_layers_9_self_attn_q_proj.lora_down.weight", - "text_model.transformer.layers.9.self_attn.q_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_9_self_attn_k_proj.lora_down.weight", - "text_model.transformer.layers.9.self_attn.k_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_9_self_attn_v_proj.lora_down.weight", - "text_model.transformer.layers.9.self_attn.v_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_9_self_attn_out_proj.lora_down.weight", - "text_model.transformer.layers.9.self_attn.out_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_9_mlp_fc1.lora_down.weight", - "text_model.transformer.layers.9.linear1.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_9_mlp_fc2.lora_down.weight", - "text_model.transformer.layers.9.linear2.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_9_self_attn_q_proj.lora_up.weight", - "text_model.transformer.layers.9.self_attn.q_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_9_self_attn_k_proj.lora_up.weight", - "text_model.transformer.layers.9.self_attn.k_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_9_self_attn_v_proj.lora_up.weight", - "text_model.transformer.layers.9.self_attn.v_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_9_self_attn_out_proj.lora_up.weight", - "text_model.transformer.layers.9.self_attn.out_proj.lora_up.weight", - ], 
- [ - "lora_te_text_model_encoder_layers_9_mlp_fc1.lora_up.weight", - "text_model.transformer.layers.9.linear1.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_9_mlp_fc2.lora_up.weight", - "text_model.transformer.layers.9.linear2.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_9_self_attn_q_proj.alpha", - "text_model.transformer.layers.9.self_attn.q_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_9_self_attn_k_proj.alpha", - "text_model.transformer.layers.9.self_attn.k_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_9_self_attn_v_proj.alpha", - "text_model.transformer.layers.9.self_attn.v_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_9_self_attn_out_proj.alpha", - "text_model.transformer.layers.9.self_attn.out_proj.alpha", - ], - ["lora_te_text_model_encoder_layers_9_mlp_fc1.alpha", "text_model.transformer.layers.9.linear1.alpha"], - ["lora_te_text_model_encoder_layers_9_mlp_fc2.alpha", "text_model.transformer.layers.9.linear2.alpha"], - [ - "lora_te_text_model_encoder_layers_10_self_attn_q_proj.lora_down.weight", - "text_model.transformer.layers.10.self_attn.q_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_10_self_attn_k_proj.lora_down.weight", - "text_model.transformer.layers.10.self_attn.k_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_10_self_attn_v_proj.lora_down.weight", - "text_model.transformer.layers.10.self_attn.v_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_10_self_attn_out_proj.lora_down.weight", - "text_model.transformer.layers.10.self_attn.out_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_10_mlp_fc1.lora_down.weight", - "text_model.transformer.layers.10.linear1.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_10_mlp_fc2.lora_down.weight", - "text_model.transformer.layers.10.linear2.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_10_self_attn_q_proj.lora_up.weight", - "text_model.transformer.layers.10.self_attn.q_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_10_self_attn_k_proj.lora_up.weight", - "text_model.transformer.layers.10.self_attn.k_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_10_self_attn_v_proj.lora_up.weight", - "text_model.transformer.layers.10.self_attn.v_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_10_self_attn_out_proj.lora_up.weight", - "text_model.transformer.layers.10.self_attn.out_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_10_mlp_fc1.lora_up.weight", - "text_model.transformer.layers.10.linear1.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_10_mlp_fc2.lora_up.weight", - "text_model.transformer.layers.10.linear2.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_10_self_attn_q_proj.alpha", - "text_model.transformer.layers.10.self_attn.q_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_10_self_attn_k_proj.alpha", - "text_model.transformer.layers.10.self_attn.k_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_10_self_attn_v_proj.alpha", - "text_model.transformer.layers.10.self_attn.v_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_10_self_attn_out_proj.alpha", - "text_model.transformer.layers.10.self_attn.out_proj.alpha", - ], - ["lora_te_text_model_encoder_layers_10_mlp_fc1.alpha", "text_model.transformer.layers.10.linear1.alpha"], - ["lora_te_text_model_encoder_layers_10_mlp_fc2.alpha", 
"text_model.transformer.layers.10.linear2.alpha"], - [ - "lora_te_text_model_encoder_layers_11_self_attn_q_proj.lora_down.weight", - "text_model.transformer.layers.11.self_attn.q_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_11_self_attn_k_proj.lora_down.weight", - "text_model.transformer.layers.11.self_attn.k_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_11_self_attn_v_proj.lora_down.weight", - "text_model.transformer.layers.11.self_attn.v_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_11_self_attn_out_proj.lora_down.weight", - "text_model.transformer.layers.11.self_attn.out_proj.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_11_mlp_fc1.lora_down.weight", - "text_model.transformer.layers.11.linear1.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_11_mlp_fc2.lora_down.weight", - "text_model.transformer.layers.11.linear2.lora_down.weight", - ], - [ - "lora_te_text_model_encoder_layers_11_self_attn_q_proj.lora_up.weight", - "text_model.transformer.layers.11.self_attn.q_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_11_self_attn_k_proj.lora_up.weight", - "text_model.transformer.layers.11.self_attn.k_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_11_self_attn_v_proj.lora_up.weight", - "text_model.transformer.layers.11.self_attn.v_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_11_self_attn_out_proj.lora_up.weight", - "text_model.transformer.layers.11.self_attn.out_proj.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_11_mlp_fc1.lora_up.weight", - "text_model.transformer.layers.11.linear1.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_11_mlp_fc2.lora_up.weight", - "text_model.transformer.layers.11.linear2.lora_up.weight", - ], - [ - "lora_te_text_model_encoder_layers_11_self_attn_q_proj.alpha", - "text_model.transformer.layers.11.self_attn.q_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_11_self_attn_k_proj.alpha", - "text_model.transformer.layers.11.self_attn.k_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_11_self_attn_v_proj.alpha", - "text_model.transformer.layers.11.self_attn.v_proj.alpha", - ], - [ - "lora_te_text_model_encoder_layers_11_self_attn_out_proj.alpha", - "text_model.transformer.layers.11.self_attn.out_proj.alpha", - ], - ["lora_te_text_model_encoder_layers_11_mlp_fc1.alpha", "text_model.transformer.layers.11.linear1.alpha"], - ["lora_te_text_model_encoder_layers_11_mlp_fc2.alpha", "text_model.transformer.layers.11.linear2.alpha"], - [ - "lora_unet_down_blocks_0_attentions_0_proj_in.lora_down.weight", - "down_blocks.0.attentions.0.proj_in.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_proj_in.lora_up.weight", - "down_blocks.0.attentions.0.proj_in.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_attn1_to_q.lora_down.weight", - "down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_q.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_attn1_to_q.lora_up.weight", - "down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_q.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_attn1_to_k.lora_down.weight", - "down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_k.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_attn1_to_k.lora_up.weight", - 
"down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_k.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_attn1_to_v.lora_down.weight", - "down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_v.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_attn1_to_v.lora_up.weight", - "down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_v.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_attn1_to_out_0.lora_down.weight", - "down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_out.0.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_attn1_to_out_0.lora_up.weight", - "down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_out.0.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_ff_net_0_proj.lora_down.weight", - "down_blocks.0.attentions.0.transformer_blocks.0.ff.net.0.proj.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_ff_net_0_proj.lora_up.weight", - "down_blocks.0.attentions.0.transformer_blocks.0.ff.net.0.proj.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_ff_net_2.lora_down.weight", - "down_blocks.0.attentions.0.transformer_blocks.0.ff.net.2.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_ff_net_2.lora_up.weight", - "down_blocks.0.attentions.0.transformer_blocks.0.ff.net.2.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_attn2_to_q.lora_down.weight", - "down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_q.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_attn2_to_q.lora_up.weight", - "down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_q.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_attn2_to_k.lora_down.weight", - "down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_k.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_attn2_to_k.lora_up.weight", - "down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_k.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_attn2_to_v.lora_down.weight", - "down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_v.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_attn2_to_v.lora_up.weight", - "down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_v.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_attn2_to_out_0.lora_down.weight", - "down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_out.0.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_attn2_to_out_0.lora_up.weight", - "down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_out.0.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_proj_out.lora_down.weight", - "down_blocks.0.attentions.0.proj_out.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_0_proj_out.lora_up.weight", - "down_blocks.0.attentions.0.proj_out.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_proj_in.lora_down.weight", - "down_blocks.0.attentions.1.proj_in.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_proj_in.lora_up.weight", - "down_blocks.0.attentions.1.proj_in.lora_up.weight", - ], - [ - 
"lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_attn1_to_q.lora_down.weight", - "down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_q.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_attn1_to_q.lora_up.weight", - "down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_q.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_attn1_to_k.lora_down.weight", - "down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_k.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_attn1_to_k.lora_up.weight", - "down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_k.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_attn1_to_v.lora_down.weight", - "down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_v.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_attn1_to_v.lora_up.weight", - "down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_v.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_attn1_to_out_0.lora_down.weight", - "down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_out.0.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_attn1_to_out_0.lora_up.weight", - "down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_out.0.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_ff_net_0_proj.lora_down.weight", - "down_blocks.0.attentions.1.transformer_blocks.0.ff.net.0.proj.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_ff_net_0_proj.lora_up.weight", - "down_blocks.0.attentions.1.transformer_blocks.0.ff.net.0.proj.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_ff_net_2.lora_down.weight", - "down_blocks.0.attentions.1.transformer_blocks.0.ff.net.2.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_ff_net_2.lora_up.weight", - "down_blocks.0.attentions.1.transformer_blocks.0.ff.net.2.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_attn2_to_q.lora_down.weight", - "down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_q.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_attn2_to_q.lora_up.weight", - "down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_q.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_attn2_to_k.lora_down.weight", - "down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_k.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_attn2_to_k.lora_up.weight", - "down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_k.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_attn2_to_v.lora_down.weight", - "down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_v.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_attn2_to_v.lora_up.weight", - "down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_v.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_attn2_to_out_0.lora_down.weight", - "down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_out.0.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_transformer_blocks_0_attn2_to_out_0.lora_up.weight", - 
"down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_out.0.lora_up.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_proj_out.lora_down.weight", - "down_blocks.0.attentions.1.proj_out.lora_down.weight", - ], - [ - "lora_unet_down_blocks_0_attentions_1_proj_out.lora_up.weight", - "down_blocks.0.attentions.1.proj_out.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_proj_in.lora_down.weight", - "down_blocks.1.attentions.0.proj_in.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_proj_in.lora_up.weight", - "down_blocks.1.attentions.0.proj_in.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_attn1_to_q.lora_down.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_q.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_attn1_to_q.lora_up.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_q.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_attn1_to_k.lora_down.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_k.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_attn1_to_k.lora_up.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_k.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_attn1_to_v.lora_down.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_v.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_attn1_to_v.lora_up.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_v.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_attn1_to_out_0.lora_down.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out.0.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_attn1_to_out_0.lora_up.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out.0.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_ff_net_0_proj.lora_down.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.ff.net.0.proj.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_ff_net_0_proj.lora_up.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.ff.net.0.proj.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_ff_net_2.lora_down.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.ff.net.2.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_ff_net_2.lora_up.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.ff.net.2.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_attn2_to_q.lora_down.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_q.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_attn2_to_q.lora_up.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_q.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_attn2_to_k.lora_down.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_k.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_attn2_to_k.lora_up.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_k.lora_up.weight", - ], - [ - 
"lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_attn2_to_v.lora_down.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_v.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_attn2_to_v.lora_up.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_v.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_attn2_to_out_0.lora_down.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out.0.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_transformer_blocks_0_attn2_to_out_0.lora_up.weight", - "down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out.0.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_proj_out.lora_down.weight", - "down_blocks.1.attentions.0.proj_out.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_0_proj_out.lora_up.weight", - "down_blocks.1.attentions.0.proj_out.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_proj_in.lora_down.weight", - "down_blocks.1.attentions.1.proj_in.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_proj_in.lora_up.weight", - "down_blocks.1.attentions.1.proj_in.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_attn1_to_q.lora_down.weight", - "down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_q.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_attn1_to_q.lora_up.weight", - "down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_q.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_attn1_to_k.lora_down.weight", - "down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_k.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_attn1_to_k.lora_up.weight", - "down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_k.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_attn1_to_v.lora_down.weight", - "down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_v.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_attn1_to_v.lora_up.weight", - "down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_v.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_attn1_to_out_0.lora_down.weight", - "down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out.0.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_attn1_to_out_0.lora_up.weight", - "down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out.0.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_ff_net_0_proj.lora_down.weight", - "down_blocks.1.attentions.1.transformer_blocks.0.ff.net.0.proj.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_ff_net_0_proj.lora_up.weight", - "down_blocks.1.attentions.1.transformer_blocks.0.ff.net.0.proj.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_ff_net_2.lora_down.weight", - "down_blocks.1.attentions.1.transformer_blocks.0.ff.net.2.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_ff_net_2.lora_up.weight", - "down_blocks.1.attentions.1.transformer_blocks.0.ff.net.2.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_attn2_to_q.lora_down.weight", - 
"down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_q.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_attn2_to_q.lora_up.weight", - "down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_q.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_attn2_to_k.lora_down.weight", - "down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_k.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_attn2_to_k.lora_up.weight", - "down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_k.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_attn2_to_v.lora_down.weight", - "down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_v.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_attn2_to_v.lora_up.weight", - "down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_v.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_attn2_to_out_0.lora_down.weight", - "down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out.0.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_transformer_blocks_0_attn2_to_out_0.lora_up.weight", - "down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out.0.lora_up.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_proj_out.lora_down.weight", - "down_blocks.1.attentions.1.proj_out.lora_down.weight", - ], - [ - "lora_unet_down_blocks_1_attentions_1_proj_out.lora_up.weight", - "down_blocks.1.attentions.1.proj_out.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_proj_in.lora_down.weight", - "down_blocks.2.attentions.0.proj_in.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_proj_in.lora_up.weight", - "down_blocks.2.attentions.0.proj_in.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_attn1_to_q.lora_down.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_q.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_attn1_to_q.lora_up.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_q.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_attn1_to_k.lora_down.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_k.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_attn1_to_k.lora_up.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_k.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_attn1_to_v.lora_down.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_v.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_attn1_to_v.lora_up.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_v.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_attn1_to_out_0.lora_down.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out.0.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_attn1_to_out_0.lora_up.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out.0.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_ff_net_0_proj.lora_down.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.ff.net.0.proj.lora_down.weight", - ], - [ - 
"lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_ff_net_0_proj.lora_up.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.ff.net.0.proj.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_ff_net_2.lora_down.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.ff.net.2.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_ff_net_2.lora_up.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.ff.net.2.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_attn2_to_q.lora_down.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_q.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_attn2_to_q.lora_up.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_q.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_attn2_to_k.lora_down.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_k.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_attn2_to_k.lora_up.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_k.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_attn2_to_v.lora_down.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_v.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_attn2_to_v.lora_up.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_v.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_attn2_to_out_0.lora_down.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out.0.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_transformer_blocks_0_attn2_to_out_0.lora_up.weight", - "down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out.0.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_proj_out.lora_down.weight", - "down_blocks.2.attentions.0.proj_out.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_0_proj_out.lora_up.weight", - "down_blocks.2.attentions.0.proj_out.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_proj_in.lora_down.weight", - "down_blocks.2.attentions.1.proj_in.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_proj_in.lora_up.weight", - "down_blocks.2.attentions.1.proj_in.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_attn1_to_q.lora_down.weight", - "down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_q.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_attn1_to_q.lora_up.weight", - "down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_q.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_attn1_to_k.lora_down.weight", - "down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_k.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_attn1_to_k.lora_up.weight", - "down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_k.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_attn1_to_v.lora_down.weight", - "down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_v.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_attn1_to_v.lora_up.weight", - 
"down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_v.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_attn1_to_out_0.lora_down.weight", - "down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.0.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_attn1_to_out_0.lora_up.weight", - "down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.0.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_ff_net_0_proj.lora_down.weight", - "down_blocks.2.attentions.1.transformer_blocks.0.ff.net.0.proj.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_ff_net_0_proj.lora_up.weight", - "down_blocks.2.attentions.1.transformer_blocks.0.ff.net.0.proj.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_ff_net_2.lora_down.weight", - "down_blocks.2.attentions.1.transformer_blocks.0.ff.net.2.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_ff_net_2.lora_up.weight", - "down_blocks.2.attentions.1.transformer_blocks.0.ff.net.2.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_attn2_to_q.lora_down.weight", - "down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_q.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_attn2_to_q.lora_up.weight", - "down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_q.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_attn2_to_k.lora_down.weight", - "down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_k.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_attn2_to_k.lora_up.weight", - "down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_k.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_attn2_to_v.lora_down.weight", - "down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_v.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_attn2_to_v.lora_up.weight", - "down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_v.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_attn2_to_out_0.lora_down.weight", - "down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.0.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_transformer_blocks_0_attn2_to_out_0.lora_up.weight", - "down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.0.lora_up.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_proj_out.lora_down.weight", - "down_blocks.2.attentions.1.proj_out.lora_down.weight", - ], - [ - "lora_unet_down_blocks_2_attentions_1_proj_out.lora_up.weight", - "down_blocks.2.attentions.1.proj_out.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_proj_in.lora_down.weight", - "up_blocks.1.attentions.0.proj_in.lora_down.weight", - ], - ["lora_unet_up_blocks_1_attentions_0_proj_in.lora_up.weight", "up_blocks.1.attentions.0.proj_in.lora_up.weight"], - [ - "lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_attn1_to_q.lora_down.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_q.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_attn1_to_q.lora_up.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_q.lora_up.weight", - ], - [ - 
"lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_attn1_to_k.lora_down.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_k.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_attn1_to_k.lora_up.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_k.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_attn1_to_v.lora_down.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_v.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_attn1_to_v.lora_up.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_v.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_attn1_to_out_0.lora_down.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out.0.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_attn1_to_out_0.lora_up.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out.0.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_ff_net_0_proj.lora_down.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.ff.net.0.proj.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_ff_net_0_proj.lora_up.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.ff.net.0.proj.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_ff_net_2.lora_down.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.ff.net.2.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_ff_net_2.lora_up.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.ff.net.2.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_attn2_to_q.lora_down.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_q.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_attn2_to_q.lora_up.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_q.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_attn2_to_k.lora_down.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_k.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_attn2_to_k.lora_up.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_k.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_attn2_to_v.lora_down.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_v.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_attn2_to_v.lora_up.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_v.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_attn2_to_out_0.lora_down.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out.0.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_transformer_blocks_0_attn2_to_out_0.lora_up.weight", - "up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out.0.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_0_proj_out.lora_down.weight", - "up_blocks.1.attentions.0.proj_out.lora_down.weight", - ], - ["lora_unet_up_blocks_1_attentions_0_proj_out.lora_up.weight", "up_blocks.1.attentions.0.proj_out.lora_up.weight"], - [ - "lora_unet_up_blocks_1_attentions_1_proj_in.lora_down.weight", - "up_blocks.1.attentions.1.proj_in.lora_down.weight", - 
], - ["lora_unet_up_blocks_1_attentions_1_proj_in.lora_up.weight", "up_blocks.1.attentions.1.proj_in.lora_up.weight"], - [ - "lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_attn1_to_q.lora_down.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_q.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_attn1_to_q.lora_up.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_q.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_attn1_to_k.lora_down.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_k.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_attn1_to_k.lora_up.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_k.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_attn1_to_v.lora_down.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_v.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_attn1_to_v.lora_up.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_v.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_attn1_to_out_0.lora_down.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out.0.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_attn1_to_out_0.lora_up.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out.0.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_ff_net_0_proj.lora_down.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.ff.net.0.proj.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_ff_net_0_proj.lora_up.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.ff.net.0.proj.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_ff_net_2.lora_down.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.ff.net.2.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_ff_net_2.lora_up.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.ff.net.2.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_attn2_to_q.lora_down.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_q.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_attn2_to_q.lora_up.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_q.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_attn2_to_k.lora_down.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_k.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_attn2_to_k.lora_up.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_k.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_attn2_to_v.lora_down.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_v.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_attn2_to_v.lora_up.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_v.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_attn2_to_out_0.lora_down.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out.0.lora_down.weight", - ], - [ - 
"lora_unet_up_blocks_1_attentions_1_transformer_blocks_0_attn2_to_out_0.lora_up.weight", - "up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out.0.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_1_proj_out.lora_down.weight", - "up_blocks.1.attentions.1.proj_out.lora_down.weight", - ], - ["lora_unet_up_blocks_1_attentions_1_proj_out.lora_up.weight", "up_blocks.1.attentions.1.proj_out.lora_up.weight"], - [ - "lora_unet_up_blocks_1_attentions_2_proj_in.lora_down.weight", - "up_blocks.1.attentions.2.proj_in.lora_down.weight", - ], - ["lora_unet_up_blocks_1_attentions_2_proj_in.lora_up.weight", "up_blocks.1.attentions.2.proj_in.lora_up.weight"], - [ - "lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_attn1_to_q.lora_down.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_q.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_attn1_to_q.lora_up.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_q.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_attn1_to_k.lora_down.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_k.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_attn1_to_k.lora_up.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_k.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_attn1_to_v.lora_down.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_v.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_attn1_to_v.lora_up.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_v.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_attn1_to_out_0.lora_down.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_out.0.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_attn1_to_out_0.lora_up.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_out.0.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_ff_net_0_proj.lora_down.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.ff.net.0.proj.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_ff_net_0_proj.lora_up.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.ff.net.0.proj.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_ff_net_2.lora_down.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.ff.net.2.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_ff_net_2.lora_up.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.ff.net.2.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_attn2_to_q.lora_down.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_q.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_attn2_to_q.lora_up.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_q.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_attn2_to_k.lora_down.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_k.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_attn2_to_k.lora_up.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_k.lora_up.weight", - ], - [ - 
"lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_attn2_to_v.lora_down.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_v.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_attn2_to_v.lora_up.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_v.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_attn2_to_out_0.lora_down.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_out.0.lora_down.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_2_transformer_blocks_0_attn2_to_out_0.lora_up.weight", - "up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_out.0.lora_up.weight", - ], - [ - "lora_unet_up_blocks_1_attentions_2_proj_out.lora_down.weight", - "up_blocks.1.attentions.2.proj_out.lora_down.weight", - ], - ["lora_unet_up_blocks_1_attentions_2_proj_out.lora_up.weight", "up_blocks.1.attentions.2.proj_out.lora_up.weight"], - [ - "lora_unet_up_blocks_2_attentions_0_proj_in.lora_down.weight", - "up_blocks.2.attentions.0.proj_in.lora_down.weight", - ], - ["lora_unet_up_blocks_2_attentions_0_proj_in.lora_up.weight", "up_blocks.2.attentions.0.proj_in.lora_up.weight"], - [ - "lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_attn1_to_q.lora_down.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_q.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_attn1_to_q.lora_up.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_q.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_attn1_to_k.lora_down.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_k.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_attn1_to_k.lora_up.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_k.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_attn1_to_v.lora_down.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_v.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_attn1_to_v.lora_up.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_v.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_attn1_to_out_0.lora_down.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out.0.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_attn1_to_out_0.lora_up.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out.0.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_ff_net_0_proj.lora_down.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.ff.net.0.proj.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_ff_net_0_proj.lora_up.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.ff.net.0.proj.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_ff_net_2.lora_down.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.ff.net.2.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_ff_net_2.lora_up.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.ff.net.2.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_attn2_to_q.lora_down.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_q.lora_down.weight", - ], - [ - 
"lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_attn2_to_q.lora_up.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_q.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_attn2_to_k.lora_down.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_k.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_attn2_to_k.lora_up.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_k.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_attn2_to_v.lora_down.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_v.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_attn2_to_v.lora_up.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_v.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_attn2_to_out_0.lora_down.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out.0.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_0_transformer_blocks_0_attn2_to_out_0.lora_up.weight", - "up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out.0.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_0_proj_out.lora_down.weight", - "up_blocks.2.attentions.0.proj_out.lora_down.weight", - ], - ["lora_unet_up_blocks_2_attentions_0_proj_out.lora_up.weight", "up_blocks.2.attentions.0.proj_out.lora_up.weight"], - [ - "lora_unet_up_blocks_2_attentions_1_proj_in.lora_down.weight", - "up_blocks.2.attentions.1.proj_in.lora_down.weight", - ], - ["lora_unet_up_blocks_2_attentions_1_proj_in.lora_up.weight", "up_blocks.2.attentions.1.proj_in.lora_up.weight"], - [ - "lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_attn1_to_q.lora_down.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_q.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_attn1_to_q.lora_up.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_q.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_attn1_to_k.lora_down.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_k.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_attn1_to_k.lora_up.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_k.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_attn1_to_v.lora_down.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_v.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_attn1_to_v.lora_up.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_v.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_attn1_to_out_0.lora_down.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.0.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_attn1_to_out_0.lora_up.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.0.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_ff_net_0_proj.lora_down.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.ff.net.0.proj.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_ff_net_0_proj.lora_up.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.ff.net.0.proj.lora_up.weight", - ], - [ - 
"lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_ff_net_2.lora_down.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.ff.net.2.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_ff_net_2.lora_up.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.ff.net.2.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_attn2_to_q.lora_down.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_q.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_attn2_to_q.lora_up.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_q.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_attn2_to_k.lora_down.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_k.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_attn2_to_k.lora_up.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_k.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_attn2_to_v.lora_down.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_v.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_attn2_to_v.lora_up.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_v.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_attn2_to_out_0.lora_down.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.0.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_1_transformer_blocks_0_attn2_to_out_0.lora_up.weight", - "up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.0.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_1_proj_out.lora_down.weight", - "up_blocks.2.attentions.1.proj_out.lora_down.weight", - ], - ["lora_unet_up_blocks_2_attentions_1_proj_out.lora_up.weight", "up_blocks.2.attentions.1.proj_out.lora_up.weight"], - [ - "lora_unet_up_blocks_2_attentions_2_proj_in.lora_down.weight", - "up_blocks.2.attentions.2.proj_in.lora_down.weight", - ], - ["lora_unet_up_blocks_2_attentions_2_proj_in.lora_up.weight", "up_blocks.2.attentions.2.proj_in.lora_up.weight"], - [ - "lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_attn1_to_q.lora_down.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_q.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_attn1_to_q.lora_up.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_q.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_attn1_to_k.lora_down.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_k.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_attn1_to_k.lora_up.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_k.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_attn1_to_v.lora_down.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_v.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_attn1_to_v.lora_up.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_v.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_attn1_to_out_0.lora_down.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_out.0.lora_down.weight", - ], - [ - 
"lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_attn1_to_out_0.lora_up.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_out.0.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_ff_net_0_proj.lora_down.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.ff.net.0.proj.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_ff_net_0_proj.lora_up.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.ff.net.0.proj.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_ff_net_2.lora_down.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.ff.net.2.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_ff_net_2.lora_up.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.ff.net.2.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_attn2_to_q.lora_down.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_q.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_attn2_to_q.lora_up.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_q.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_attn2_to_k.lora_down.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_k.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_attn2_to_k.lora_up.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_k.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_attn2_to_v.lora_down.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_v.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_attn2_to_v.lora_up.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_v.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_attn2_to_out_0.lora_down.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_out.0.lora_down.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_2_transformer_blocks_0_attn2_to_out_0.lora_up.weight", - "up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_out.0.lora_up.weight", - ], - [ - "lora_unet_up_blocks_2_attentions_2_proj_out.lora_down.weight", - "up_blocks.2.attentions.2.proj_out.lora_down.weight", - ], - ["lora_unet_up_blocks_2_attentions_2_proj_out.lora_up.weight", "up_blocks.2.attentions.2.proj_out.lora_up.weight"], - [ - "lora_unet_up_blocks_3_attentions_0_proj_in.lora_down.weight", - "up_blocks.3.attentions.0.proj_in.lora_down.weight", - ], - ["lora_unet_up_blocks_3_attentions_0_proj_in.lora_up.weight", "up_blocks.3.attentions.0.proj_in.lora_up.weight"], - [ - "lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_attn1_to_q.lora_down.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_q.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_attn1_to_q.lora_up.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_q.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_attn1_to_k.lora_down.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_k.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_attn1_to_k.lora_up.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_k.lora_up.weight", - ], - [ - 
"lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_attn1_to_v.lora_down.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_v.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_attn1_to_v.lora_up.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_v.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_attn1_to_out_0.lora_down.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_out.0.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_attn1_to_out_0.lora_up.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_out.0.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_ff_net_0_proj.lora_down.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.ff.net.0.proj.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_ff_net_0_proj.lora_up.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.ff.net.0.proj.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_ff_net_2.lora_down.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.ff.net.2.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_ff_net_2.lora_up.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.ff.net.2.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_attn2_to_q.lora_down.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_q.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_attn2_to_q.lora_up.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_q.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_attn2_to_k.lora_down.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_k.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_attn2_to_k.lora_up.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_k.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_attn2_to_v.lora_down.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_v.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_attn2_to_v.lora_up.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_v.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_attn2_to_out_0.lora_down.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_out.0.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_0_transformer_blocks_0_attn2_to_out_0.lora_up.weight", - "up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_out.0.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_0_proj_out.lora_down.weight", - "up_blocks.3.attentions.0.proj_out.lora_down.weight", - ], - ["lora_unet_up_blocks_3_attentions_0_proj_out.lora_up.weight", "up_blocks.3.attentions.0.proj_out.lora_up.weight"], - [ - "lora_unet_up_blocks_3_attentions_1_proj_in.lora_down.weight", - "up_blocks.3.attentions.1.proj_in.lora_down.weight", - ], - ["lora_unet_up_blocks_3_attentions_1_proj_in.lora_up.weight", "up_blocks.3.attentions.1.proj_in.lora_up.weight"], - [ - "lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_attn1_to_q.lora_down.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_q.lora_down.weight", - ], - [ - 
"lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_attn1_to_q.lora_up.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_q.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_attn1_to_k.lora_down.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_k.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_attn1_to_k.lora_up.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_k.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_attn1_to_v.lora_down.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_v.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_attn1_to_v.lora_up.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_v.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_attn1_to_out_0.lora_down.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_out.0.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_attn1_to_out_0.lora_up.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_out.0.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_ff_net_0_proj.lora_down.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.ff.net.0.proj.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_ff_net_0_proj.lora_up.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.ff.net.0.proj.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_ff_net_2.lora_down.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.ff.net.2.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_ff_net_2.lora_up.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.ff.net.2.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_attn2_to_q.lora_down.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_q.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_attn2_to_q.lora_up.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_q.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_attn2_to_k.lora_down.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_k.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_attn2_to_k.lora_up.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_k.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_attn2_to_v.lora_down.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_v.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_attn2_to_v.lora_up.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_v.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_attn2_to_out_0.lora_down.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_out.0.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_1_transformer_blocks_0_attn2_to_out_0.lora_up.weight", - "up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_out.0.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_1_proj_out.lora_down.weight", - "up_blocks.3.attentions.1.proj_out.lora_down.weight", - ], - ["lora_unet_up_blocks_3_attentions_1_proj_out.lora_up.weight", 
"up_blocks.3.attentions.1.proj_out.lora_up.weight"], - [ - "lora_unet_up_blocks_3_attentions_2_proj_in.lora_down.weight", - "up_blocks.3.attentions.2.proj_in.lora_down.weight", - ], - ["lora_unet_up_blocks_3_attentions_2_proj_in.lora_up.weight", "up_blocks.3.attentions.2.proj_in.lora_up.weight"], - [ - "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_attn1_to_q.lora_down.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_q.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_attn1_to_q.lora_up.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_q.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_attn1_to_k.lora_down.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_k.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_attn1_to_k.lora_up.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_k.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_attn1_to_v.lora_down.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_v.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_attn1_to_v.lora_up.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_v.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_attn1_to_out_0.lora_down.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_out.0.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_attn1_to_out_0.lora_up.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_out.0.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_ff_net_0_proj.lora_down.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.ff.net.0.proj.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_ff_net_0_proj.lora_up.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.ff.net.0.proj.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_ff_net_2.lora_down.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.ff.net.2.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_ff_net_2.lora_up.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.ff.net.2.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_attn2_to_q.lora_down.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_q.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_attn2_to_q.lora_up.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_q.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_attn2_to_k.lora_down.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_k.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_attn2_to_k.lora_up.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_k.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_attn2_to_v.lora_down.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_v.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_attn2_to_v.lora_up.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_v.lora_up.weight", - ], - [ - 
"lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_attn2_to_out_0.lora_down.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_out.0.lora_down.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_attn2_to_out_0.lora_up.weight", - "up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_out.0.lora_up.weight", - ], - [ - "lora_unet_up_blocks_3_attentions_2_proj_out.lora_down.weight", - "up_blocks.3.attentions.2.proj_out.lora_down.weight", - ], - ["lora_unet_up_blocks_3_attentions_2_proj_out.lora_up.weight", "up_blocks.3.attentions.2.proj_out.lora_up.weight"], - ["lora_unet_mid_block_attentions_0_proj_in.lora_down.weight", "mid_block.attentions.0.proj_in.lora_down.weight"], - ["lora_unet_mid_block_attentions_0_proj_in.lora_up.weight", "mid_block.attentions.0.proj_in.lora_up.weight"], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_attn1_to_q.lora_down.weight", - "mid_block.attentions.0.transformer_blocks.0.attn1.to_q.lora_down.weight", - ], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_attn1_to_q.lora_up.weight", - "mid_block.attentions.0.transformer_blocks.0.attn1.to_q.lora_up.weight", - ], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_attn1_to_k.lora_down.weight", - "mid_block.attentions.0.transformer_blocks.0.attn1.to_k.lora_down.weight", - ], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_attn1_to_k.lora_up.weight", - "mid_block.attentions.0.transformer_blocks.0.attn1.to_k.lora_up.weight", - ], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_attn1_to_v.lora_down.weight", - "mid_block.attentions.0.transformer_blocks.0.attn1.to_v.lora_down.weight", - ], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_attn1_to_v.lora_up.weight", - "mid_block.attentions.0.transformer_blocks.0.attn1.to_v.lora_up.weight", - ], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_attn1_to_out_0.lora_down.weight", - "mid_block.attentions.0.transformer_blocks.0.attn1.to_out.0.lora_down.weight", - ], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_attn1_to_out_0.lora_up.weight", - "mid_block.attentions.0.transformer_blocks.0.attn1.to_out.0.lora_up.weight", - ], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_ff_net_0_proj.lora_down.weight", - "mid_block.attentions.0.transformer_blocks.0.ff.net.0.proj.lora_down.weight", - ], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_ff_net_0_proj.lora_up.weight", - "mid_block.attentions.0.transformer_blocks.0.ff.net.0.proj.lora_up.weight", - ], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_ff_net_2.lora_down.weight", - "mid_block.attentions.0.transformer_blocks.0.ff.net.2.lora_down.weight", - ], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_ff_net_2.lora_up.weight", - "mid_block.attentions.0.transformer_blocks.0.ff.net.2.lora_up.weight", - ], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_attn2_to_q.lora_down.weight", - "mid_block.attentions.0.transformer_blocks.0.attn2.to_q.lora_down.weight", - ], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_attn2_to_q.lora_up.weight", - "mid_block.attentions.0.transformer_blocks.0.attn2.to_q.lora_up.weight", - ], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_attn2_to_k.lora_down.weight", - "mid_block.attentions.0.transformer_blocks.0.attn2.to_k.lora_down.weight", - ], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_attn2_to_k.lora_up.weight", - 
"mid_block.attentions.0.transformer_blocks.0.attn2.to_k.lora_up.weight", - ], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_attn2_to_v.lora_down.weight", - "mid_block.attentions.0.transformer_blocks.0.attn2.to_v.lora_down.weight", - ], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_attn2_to_v.lora_up.weight", - "mid_block.attentions.0.transformer_blocks.0.attn2.to_v.lora_up.weight", - ], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_attn2_to_out_0.lora_down.weight", - "mid_block.attentions.0.transformer_blocks.0.attn2.to_out.0.lora_down.weight", - ], - [ - "lora_unet_mid_block_attentions_0_transformer_blocks_0_attn2_to_out_0.lora_up.weight", - "mid_block.attentions.0.transformer_blocks.0.attn2.to_out.0.lora_up.weight", - ], - ["lora_unet_mid_block_attentions_0_proj_out.lora_down.weight", "mid_block.attentions.0.proj_out.lora_down.weight"], - ["lora_unet_mid_block_attentions_0_proj_out.lora_up.weight", "mid_block.attentions.0.proj_out.lora_up.weight"], -] diff --git a/ppdiffusers/ppdiffusers/pipeline_utils.py b/ppdiffusers/ppdiffusers/pipeline_utils.py deleted file mode 100644 index 176f1d2ffc2a..000000000000 --- a/ppdiffusers/ppdiffusers/pipeline_utils.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and - -# limitations under the License. - -# NOTE: This file is deprecated and will be removed in a future version. -# It only exists so that temporarely `from diffusers.pipelines import DiffusionPipeline` works - -from .pipelines import ( # noqa: F401 - DiffusionPipeline, - ImagePipelineOutput, - TextPipelineOutput, -) diff --git a/ppdiffusers/ppdiffusers/pipelines/README.md b/ppdiffusers/ppdiffusers/pipelines/README.md deleted file mode 100644 index 0d1bfbd56385..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/README.md +++ /dev/null @@ -1,569 +0,0 @@ -# PPDiffusers Pipelines - -Pipelines提供了一种对各种SOTA扩散模型进行各种下游任务推理的简单方式。 -大多数扩散模型系统由多个独立训练的模型和高度自适应的调度器(scheduler)组成,通过pipeline我们可以很方便的对这些扩散模型系统进行端到端的推理。 - -举例来说, Stable Diffusion由以下组件构成: -- Autoencoder -- Conditional Unet -- CLIP text encoder -- Scheduler -- CLIPFeatureExtractor -- Safety checker - -这些组件之间是独立训练或创建的,同时在Stable Diffusion的推理运行中也是必需的,我们可以通过pipelines来对整个系统进行封装,从而提供一个简洁的推理接口。 - -我们通过pipelines在统一的API下提供所有开源且SOTA的扩散模型系统的推理能力。具体来说,我们的pipelines能够提供以下功能: -1. 可以加载官方发布的权重,并根据相应的论文复现出与原始实现相同的输出 -2. 提供一个简单的用户界面来推理运行扩散模型系统,参见[Pipelines API](#pipelines-api)部分 -3. 提供易于理解的代码实现,可以与官方文档一起阅读,参见[Pipelines汇总](#Pipelines汇总)部分 -4. 支持多种模态下的10+种任务,参见[任务展示](#任务展示)部分 -5. 可以很容易地与社区建立联系 - -**【注意】** Pipelines不(也不应该)提供任何训练功能。 -如果您正在寻找训练的相关示例,请查看[examples](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples). - - -## 任务展示 -### 文本图像多模 - -
## Task Gallery (任务展示)

### Text-Image Multimodal (文本图像多模)

**Text-to-Image Generation (文图生成)**

#### text_to_image_generation-stable_diffusion

```python
from ppdiffusers import StableDiffusionPipeline

# load the model and scheduler
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

# run the pipeline for inference
prompt = "a photo of an astronaut riding a horse on mars"
image = pipe(prompt).images[0]

# save the image
image.save("astronaut_rides_horse_sd.png")
```
#### text_to_image_generation-deepfloyd_if

```python
import paddle

from ppdiffusers import DiffusionPipeline, IFPipeline, IFSuperResolutionPipeline
from ppdiffusers.utils import pd_to_pil

# Stage 1: generate images
pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16)
pipe.enable_xformers_memory_efficient_attention()
prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"'
prompt_embeds, negative_embeds = pipe.encode_prompt(prompt)
image = pipe(
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_embeds,
    output_type="pd",
).images

# save intermediate image
pil_image = pd_to_pil(image)
pil_image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_I.png")
# save gpu memory
pipe.to(paddle_device="cpu")

# Stage 2: super resolution stage1
super_res_1_pipe = IFSuperResolutionPipeline.from_pretrained(
    "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", paddle_dtype=paddle.float16
)
super_res_1_pipe.enable_xformers_memory_efficient_attention()

image = super_res_1_pipe(
    image=image,
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_embeds,
    output_type="pd",
).images
# save intermediate image
pil_image = pd_to_pil(image)
pil_image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_II.png")
# save gpu memory
super_res_1_pipe.to(paddle_device="cpu")

# Stage 3: super resolution stage2
super_res_2_pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-x4-upscaler", paddle_dtype=paddle.float16
)
super_res_2_pipe.enable_xformers_memory_efficient_attention()

image = super_res_2_pipe(
    prompt=prompt,
    image=image,
).images
image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_III.png")
```
-image -
if_stage_I
-image -
if_stage_II
-image -
if_stage_III
-
-
- - -
 文本引导的图像放大(Text-Guided Image Upscaling) - -#### text_guided_image_upscaling-stable_diffusion_2 - -```python -from ppdiffusers import StableDiffusionUpscalePipeline -from ppdiffusers.utils import load_image - -pipe = StableDiffusionUpscalePipeline.from_pretrained("stabilityai/stable-diffusion-x4-upscaler") - -url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/low_res_cat.png" -low_res_img = load_image(url).resize((128, 128)) - -prompt = "a white cat" -upscaled_image = pipe(prompt=prompt, image=low_res_img).images[0] -upscaled_image.save("upsampled_cat_sd2.png") -``` -
-image -
Original image
-image -
Generated image
-
-
- -
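A short follow-up sketch for the upscaling example above. It assumes the upscale pipeline accepts the same `num_inference_steps` and `guidance_scale` arguments as the other Stable Diffusion pipelines (omit them to keep the defaults); the x4 upscaler maps the 128x128 input to a 512x512 output.

```python
from ppdiffusers import StableDiffusionUpscalePipeline
from ppdiffusers.utils import load_image

pipe = StableDiffusionUpscalePipeline.from_pretrained("stabilityai/stable-diffusion-x4-upscaler")

url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/low_res_cat.png"
low_res_img = load_image(url).resize((128, 128))

# fewer steps run faster at some cost in detail; the prompt still guides the added texture
upscaled_image = pipe(
    prompt="a white cat",
    image=low_res_img,
    num_inference_steps=50,
    guidance_scale=7.5,
).images[0]

# the x4 upscaler turns the 128x128 input into a 512x512 output
print(low_res_img.size, "->", upscaled_image.size)
upscaled_image.save("upsampled_cat_sd2_50steps.png")
```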
 Text-Guided Image Inpainting
-
-#### text_guided_image_inpainting-stable_diffusion_2
-
-```python
-from ppdiffusers import StableDiffusionInpaintPipeline
-from ppdiffusers.utils import load_image
-
-pipe = StableDiffusionInpaintPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting")
-
-# illustrative image/mask URLs; substitute any RGB image and a matching binary mask
-img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
-mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
-
-init_image = load_image(img_url).resize((512, 512))
-mask_image = load_image(mask_url).resize((512, 512))
-
-prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
-image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
-image.save("cat_on_park_bench_sd2.png")
-```
-
-image -
Original image
-image -
Generated image
-
-
- - -
 Image-to-Image Text-Guided Generation
-
-#### image_to_image_text_guided_generation-stable_diffusion
-```python
-import paddle
-
-from ppdiffusers import StableDiffusionImg2ImgPipeline
-from ppdiffusers.utils import load_image
-
-# load the pipeline
-pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
-
-# download an initial image
-url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
-
-init_image = load_image(url).resize((768, 512))
-
-prompt = "A fantasy landscape, trending on artstation"
-# use fp16 autocast to speed up generation
-with paddle.amp.auto_cast(True):
-    image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0]
-
-image.save("fantasy_landscape.png")
-```
-
-image -
Original image
-image -
Generated image
-
-
- - -
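In the image-to-image example above, `strength` controls how much of the initial sketch is preserved: values near 0 stay close to the input, values near 1 let the prompt dominate. The sweep below is an illustrative way to inspect that trade-off; the three strength values are arbitrary.

```python
import paddle

from ppdiffusers import StableDiffusionImg2ImgPipeline
from ppdiffusers.utils import load_image

pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
init_image = load_image(url).resize((768, 512))
prompt = "A fantasy landscape, trending on artstation"

# lower strength keeps more of the sketch, higher strength follows the prompt more freely
for strength in (0.3, 0.6, 0.9):
    with paddle.amp.auto_cast(True):  # fp16 autocast, as in the example above
        image = pipe(prompt=prompt, image=init_image, strength=strength, guidance_scale=7.5).images[0]
    image.save(f"fantasy_landscape_strength_{strength}.png")
```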
 文本图像双引导图像生成(Dual Text and Image Guided Generation) - -#### dual_text_and_image_guided_generation-versatile_diffusion -```python -from ppdiffusers import VersatileDiffusionDualGuidedPipeline -from ppdiffusers.utils import load_image - -url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/benz.jpg" -image = load_image(url) -text = "a red car in the sun" - -pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion") -pipe.remove_unused_weights() - -text_to_image_strength = 0.75 -image = pipe(prompt=text, image=image, text_to_image_strength=text_to_image_strength).images[0] -image.save("versatile-diffusion-red_car.png") -``` -
-image -
Original image
-image -
Generated image
-
-
-
-### Text and Video (Multimodal)
-
- 文本条件的视频生成(Text-to-Video Generation) - -#### text_to_video_generation-synth - -```python -import imageio - -from ppdiffusers import DPMSolverMultistepScheduler, TextToVideoSDPipeline - -pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b") -pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - -prompt = "An astronaut riding a horse." -video_frames = pipe(prompt, num_inference_steps=25).frames -imageio.mimsave("text_to_video_generation-synth-result-astronaut_riding_a_horse.mp4", video_frames, fps=8) -``` -
-image -
- -#### text_to_video_generation-zero - -```python -import imageio - -# pip install imageio[ffmpeg] -import paddle - -from ppdiffusers import TextToVideoZeroPipeline - -model_id = "runwayml/stable-diffusion-v1-5" -pipe = TextToVideoZeroPipeline.from_pretrained(model_id, paddle_dtype=paddle.float16) - -prompt = "A panda is playing guitar on times square" -result = pipe(prompt=prompt).images -result = [(r * 255).astype("uint8") for r in result] -imageio.mimsave("text_to_video_generation-zero-result-panda.mp4", result, fps=4) -``` -
-image -
- -
-
-### Text and Audio (Multimodal)
-
- 文本条件的音频生成(Text-to-Audio Generation) - -#### text_to_audio_generation-audio_ldm - -```python -import paddle -import scipy - -from ppdiffusers import AudioLDMPipeline - -pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", paddle_dtype=paddle.float16) - -prompt = "Techno music with a strong, upbeat tempo and high melodic riffs" -audio = pipe(prompt, num_inference_steps=10, audio_length_in_s=5.0).audios[0] - -output_path = "text_to_audio_generation-audio_ldm-techno.wav" -# save the audio sample as a .wav file -scipy.io.wavfile.write(output_path, rate=16000, data=audio) -``` -
-
-[audio sample: generated techno clip from the example above]
-
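Like the other pipelines, AudioLDM also accepts a list of prompts in a single call. The sketch below is illustrative: the second prompt and the output file names are arbitrary, and each returned waveform is written at the same 16 kHz rate used above.

```python
import paddle
import scipy

from ppdiffusers import AudioLDMPipeline

pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", paddle_dtype=paddle.float16)

prompts = [
    "Techno music with a strong, upbeat tempo and high melodic riffs",
    "A gentle acoustic guitar melody recorded in a quiet room",
]
audios = pipe(prompts, num_inference_steps=10, audio_length_in_s=5.0).audios

# write one 16 kHz .wav file per prompt
for i, audio in enumerate(audios):
    scipy.io.wavfile.write(f"text_to_audio_generation-audio_ldm-{i}.wav", rate=16000, data=audio)
```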
-
-### Image
-
 Unconditional Image Generation
-
-#### unconditional_image_generation-latent_diffusion_uncond
-
-```python
-from ppdiffusers import LDMPipeline
-
-# load the model and scheduler
-pipe = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")
-
-# run the pipeline for inference
-image = pipe(num_inference_steps=200).images[0]
-
-# save the image
-image.save("ldm_generated_image.png")
-```
-
-image -
-
- -
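For the unconditional pipeline there is no prompt to vary, so reproducibility comes down to the random seed. The sketch below assumes `LDMPipeline` keeps the upstream `batch_size` and `generator` arguments; if the port differs, drop them and call the pipeline once per image.

```python
import paddle

from ppdiffusers import LDMPipeline

pipe = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")

# seed the sampler and draw a small batch in one call
generator = paddle.Generator().manual_seed(0)
images = pipe(batch_size=4, num_inference_steps=200, generator=generator).images

for i, image in enumerate(images):
    image.save(f"ldm_generated_image_{i}.png")
```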
 Super Resolution
-
-#### super_resolution-latent_diffusion
-```python
-import paddle
-
-from ppdiffusers import LDMSuperResolutionPipeline
-from ppdiffusers.utils import load_image
-
-# load the pipeline
-pipe = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")
-
-# download an initial image
-url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
-
-init_image = load_image(url).resize((128, 128))
-init_image.save("original-image.png")
-
-# use fp16 autocast to speed up generation
-with paddle.amp.auto_cast(True):
-    image = pipe(init_image, num_inference_steps=100, eta=1).images[0]
-
-image.save("super-resolution-image.png")
-```
-
-image -
Original image
-image -
Generated image
-
-
- - -
 图像编辑(Image Inpainting) - -#### image_inpainting-repaint -```python -from ppdiffusers import RePaintPipeline, RePaintScheduler -from ppdiffusers.utils import load_image - -img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/celeba_hq_256.png" -mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/mask_256.png" - -# Load the original image and the mask as PIL images -original_image = load_image(img_url).resize((256, 256)) -mask_image = load_image(mask_url).resize((256, 256)) - -scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256", subfolder="scheduler") -pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler) - -output = pipe( - original_image=original_image, - mask_image=mask_image, - num_inference_steps=250, - eta=0.0, - jump_length=10, - jump_n_sample=10, -) -inpainted_image = output.images[0] - -inpainted_image.save("repaint-image.png") -``` -
-image -
Original image
-image -
Mask image
-image -
Generated image
-
-
- - - -
 Image Variation
-
-#### image_variation-versatile_diffusion
-```python
-from ppdiffusers import VersatileDiffusionImageVariationPipeline
-from ppdiffusers.utils import load_image
-
-url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/benz.jpg"
-image = load_image(url)
-
-pipe = VersatileDiffusionImageVariationPipeline.from_pretrained("shi-labs/versatile-diffusion")
-
-image = pipe(image).images[0]
-image.save("versatile-diffusion-car_variation.png")
-```
-
-image -
Original image
-image -
Generated image
-
-
-
-### Audio
-
 Unconditional Audio Generation
-
-#### unconditional_audio_generation-audio_diffusion
-
-```python
-from scipy.io.wavfile import write
-from ppdiffusers import AudioDiffusionPipeline
-import paddle
-
-# load the model and scheduler
-pipe = AudioDiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256")
-pipe.set_progress_bar_config(disable=None)
-generator = paddle.Generator().manual_seed(42)
-
-output = pipe(generator=generator)
-audios = output.audios[0]
-image = output.images[0]
-
-# save each generated audio channel as a .wav file
-for i, audio in enumerate(audios):
-    write(f"audio_diffusion_test{i}.wav", pipe.mel.sample_rate, audio.transpose())
-
-# save the mel-spectrogram image
-image.save("audio_diffusion_test.png")
-```
-
-
-[audio sample: generated clip from the example above]
-
-image -
-
-
-#### unconditional_audio_generation-spectrogram_diffusion
-
-```python
-import paddle
-import scipy
-
-from ppdiffusers import MidiProcessor, SpectrogramDiffusionPipeline
-from ppdiffusers.utils.download_utils import ppdiffusers_url_download
-
-# Download MIDI from: wget https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid
-mid_file_path = ppdiffusers_url_download(
-    "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid", cache_dir="."
-)
-pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion", paddle_dtype=paddle.float16)
-processor = MidiProcessor()
-output = pipe(processor(mid_file_path))
-audio = output.audios[0]
-
-output_path = "unconditional_audio_generation-spectrogram_diffusion-result-beethoven_hammerklavier_2.wav"
-# save the audio sample as a .wav file
-scipy.io.wavfile.write(output_path, rate=16000, data=audio)
-```
-
-
-[audio sample: rendered MIDI performance from the example above]
-
- - -## Pipelines汇总 - -下表总结了所有支持的Pipelines,以及相应的来源、任务、推理脚本。 - -| Pipeline | 源链接 | 任务 | 推理脚本 -|-------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------|:---:|:---:| -| [alt_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/alt_diffusion) | [**Alt Diffusion**](https://arxiv.org/abs/2211.06679) | *Text-to-Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_to_image_generation-alt_diffusion.py) -| [alt_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/alt_diffusion) | [**Alt Diffusion**](https://arxiv.org/abs/2211.06679) | *Image-to-Image Text-Guided Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/image_to_image_text_guided_generation-alt_diffusion.py) -| [audio_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio-diffusion) | *Unconditional Audio Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/unconditional_audio_generation-audio_diffusion.py) -| [controlnet](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/controlnet) | [**ControlNet with Stable Diffusion**](https://arxiv.org/abs/2302.05543) | *Image-to-Image Text-Guided Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/image_to_image_text_guided_generation-controlnet.py) -| [dance_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/Harmonai-org/sample-generator) | *Unconditional Audio Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/unconditional_audio_generation-dance_diffusion.py) -| [ddpm](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | *Unconditional Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/unconditional_image_generation-ddpm.py) -| [ddim](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | *Unconditional Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/unconditional_image_generation-ddim.py) -| [latent_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | *Text-to-Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_to_image_generation-latent_diffusion.py) -| [latent_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion 
Models**](https://arxiv.org/abs/2112.10752) | *Super Superresolution* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/super_resolution-latent_diffusion.py) -| [latent_diffusion_uncond](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | *Unconditional Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/unconditional_image_generation-latent_diffusion_uncond.py) -| [paint_by_example](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/paint_by_example) | [**Paint by Example: Exemplar-based Image Editing with Diffusion Models**](https://arxiv.org/abs/2211.13227) | *Image-Guided Image Inpainting* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/image_guided_image_inpainting-paint_by_example.py) -| [pndm](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | *Unconditional Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/unconditional_image_generation-pndm.py) -| [repaint](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/repaint) | [**Repaint**](https://arxiv.org/abs/2201.09865) | *Image Inpainting* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/image_inpainting-repaint.py) -| [score_sde_ve](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | *Unconditional Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/unconditional_image_generation-score_sde_ve.py) -| [semantic_stable_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion) | [**Semantic Guidance**](https://arxiv.org/abs/2301.12247) | *Text-Guided Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_guided_generation-semantic_stable_diffusion.py) -| [stable_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Text-to-Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion.py) -| [stable_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Image-to-Image Text-Guided Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion.py) -| [stable_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | 
*Text-Guided Image Inpainting* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion.py) -| [stable_diffusion_2](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | *Text-to-Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_2.py) -| [stable_diffusion_2](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | *Image-to-Image Text-Guided Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion_2.py) -| [stable_diffusion_2](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | *Text-Guided Image Inpainting* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion_2.py) -| [stable_diffusion_2](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | *Text-Guided Image Upscaling* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py) -| [stable_diffusion_2](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | *Text-Guided Image Upscaling* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py) -| [stable_diffusion_safe](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | *Text-to-Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_safe.py) -| [stochastic_karras_ve](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | *Unconditional Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/unconditional_image_generation-stochastic_karras_ve.py) -| [unclip](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/unclip) | [**UnCLIP**](https://arxiv.org/abs/2204.06125) | *Text-to-Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_to_image_generation-unclip.py) -| [versatile_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion) | [**Versatile Diffusion**](https://arxiv.org/abs/2211.08332) | *Text-to-Image Generation* | 
[link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_to_image_generation-versatile_diffusion.py) -| [versatile_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion) | [**Versatile Diffusion**](https://arxiv.org/abs/2211.08332) | *Image Variation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/image_variation-versatile_diffusion.py) -| [versatile_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion) | [**Versatile Diffusion**](https://arxiv.org/abs/2211.08332) | *Dual Text and Image Guided Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/dual_text_and_image_guided_generation-versatile_diffusion.py) -| [vq_diffusion](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/ppdiffusers/pipelines/vq_diffusion) | [**VQ Diffusion**](https://arxiv.org/abs/2111.14822) | *Text-to-Image Generation* | [link](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers/examples/inference/text_to_image_generation-vq_diffusion.py) - - -**【注意】** Pipelines可以端到端的展示相应论文中描述的扩散模型系统。然而,大多数Pipelines可以使用不同的调度器组件,甚至不同的模型组件。 - -## Pipelines API - -扩散模型系统通常由多个独立训练的模型以及调度器等其他组件构成。 -其中每个模型都是在不同的任务上独立训练的,调度器可以很容易地进行替换。 -然而,在推理过程中,我们希望能够轻松地加载所有组件并在推理中使用它们,即使某个组件来自不同的库, 为此,所有pipeline都提供以下功能: - - -- `from_pretrained` 该方法接收PaddleNLP模型库id(例如`runwayml/stable-diffusion-v1-5`)或本地目录路径。为了能够准确加载相应的模型和组件,相应目录下必须提供`model_index.json`文件。 - -- `save_pretrained` 该方法接受一个本地目录路径,Pipelines的所有模型或组件都将被保存到该目录下。对于每个模型或组件,都会在给定目录下创建一个子文件夹。同时`model_index.json`文件将会创建在本地目录路径的根目录下,以便可以再次从本地路径实例化整个Pipelines。 - -- `__call__` Pipelines在推理时将调用该方法。该方法定义了Pipelines的推理逻辑,它应该包括预处理、张量在不同模型之间的前向传播、后处理等整个推理流程。 diff --git a/ppdiffusers/ppdiffusers/pipelines/__init__.py b/ppdiffusers/ppdiffusers/pipelines/__init__.py deleted file mode 100644 index 7c28798207e8..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/__init__.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from ..utils import ( - OptionalDependencyNotAvailable, - is_einops_available, - is_fastdeploy_available, - is_k_diffusion_available, - is_librosa_available, - is_note_seq_available, - is_paddle_available, - is_paddlenlp_available, -) - -try: - if not is_paddle_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ..utils.dummy_paddle_objects import * # noqa F403 -else: - from .dance_diffusion import DanceDiffusionPipeline - from .ddim import DDIMPipeline - from .ddpm import DDPMPipeline - from .dit import DiTPipeline - from .latent_diffusion import LDMSuperResolutionPipeline - from .latent_diffusion_uncond import LDMPipeline - from .pipeline_utils import ( - AudioPipelineOutput, - DiffusionPipeline, - ImagePipelineOutput, - TextPipelineOutput, - ) - from .pndm import PNDMPipeline - from .repaint import RePaintPipeline - from .score_sde_ve import ScoreSdeVePipeline - from .stochastic_karras_ve import KarrasVePipeline - -try: - if not (is_paddle_available() and is_librosa_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ..utils.dummy_paddle_and_librosa_objects import * # noqa F403 -else: - from .audio_diffusion import AudioDiffusionPipeline, Mel - -try: - if not (is_paddle_available() and is_paddlenlp_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ..utils.dummy_paddle_and_paddlenlp_objects import * # noqa F403 -else: - from .alt_diffusion import AltDiffusionImg2ImgPipeline, AltDiffusionPipeline - from .audioldm import AudioLDMPipeline - from .deepfloyd_if import ( - IFImg2ImgPipeline, - IFImg2ImgSuperResolutionPipeline, - IFInpaintingPipeline, - IFInpaintingSuperResolutionPipeline, - IFPipeline, - IFSuperResolutionPipeline, - ) - from .latent_diffusion import LDMTextToImagePipeline - from .paint_by_example import PaintByExamplePipeline - from .semantic_stable_diffusion import SemanticStableDiffusionPipeline - from .stable_diffusion import ( - CycleDiffusionPipeline, - StableDiffusionAdapterPipeline, - StableDiffusionAttendAndExcitePipeline, - StableDiffusionControlNetPipeline, - StableDiffusionDepth2ImgPipeline, - StableDiffusionImageVariationPipeline, - StableDiffusionImg2ImgPipeline, - StableDiffusionInpaintPipeline, - StableDiffusionInpaintPipelineLegacy, - StableDiffusionInstructPix2PixPipeline, - StableDiffusionLatentUpscalePipeline, - StableDiffusionMegaPipeline, - StableDiffusionModelEditingPipeline, - StableDiffusionPanoramaPipeline, - StableDiffusionPipeline, - StableDiffusionPipelineAllinOne, - StableDiffusionPix2PixZeroPipeline, - StableDiffusionSAGPipeline, - StableDiffusionUpscalePipeline, - StableUnCLIPImg2ImgPipeline, - StableUnCLIPPipeline, - ) - from .stable_diffusion_safe import StableDiffusionPipelineSafe - from .text_to_video_synthesis import TextToVideoSDPipeline, TextToVideoZeroPipeline - from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline - from .versatile_diffusion import ( - VersatileDiffusionDualGuidedPipeline, - VersatileDiffusionImageVariationPipeline, - VersatileDiffusionPipeline, - VersatileDiffusionTextToImagePipeline, - ) - from .vq_diffusion import VQDiffusionPipeline - -try: - if not is_fastdeploy_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ..utils.dummy_fastdeploy_objects import * # noqa F403 -else: - from .fastdeploy_utils import ( - FastDeployDiffusionPipelineMixin, - FastDeployRuntimeModel, - ) - -try: - if not 
(is_paddle_available() and is_paddlenlp_available() and is_fastdeploy_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ..utils.dummy_paddle_and_paddlenlp_and_fastdeploy_objects import * # noqa F403 -else: - from .stable_diffusion import ( - FastDeployCycleDiffusionPipeline, - FastDeployStableDiffusionControlNetPipeline, - FastDeployStableDiffusionImageVariationPipeline, - FastDeployStableDiffusionImg2ImgPipeline, - FastDeployStableDiffusionInpaintPipeline, - FastDeployStableDiffusionInpaintPipelineLegacy, - FastDeployStableDiffusionMegaPipeline, - FastDeployStableDiffusionPipeline, - FastDeployStableDiffusionUpscalePipeline, - ) - -try: - if not (is_paddle_available() and is_paddlenlp_available() and is_k_diffusion_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ..utils.dummy_paddle_and_paddlenlp_and_k_diffusion_objects import * # noqa F403 -else: - from .stable_diffusion import StableDiffusionKDiffusionPipeline - - -try: - if not (is_paddle_available() and is_paddlenlp_available() and is_einops_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ..utils.dummy_paddle_and_paddlenlp_and_einops_objects import * # noqa F403 -else: - from .unidiffuser import UniDiffuserPipeline - -try: - if not (is_paddle_available() and is_paddlenlp_available() and is_note_seq_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ..utils.dummy_paddle_and_paddlenlp_and_note_seq_objects import * # noqa F403 -else: - from .spectrogram_diffusion import MidiProcessor, SpectrogramDiffusionPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/__init__.py deleted file mode 100644 index 98090866859b..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/__init__.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# flake8: noqa - -from dataclasses import dataclass -from typing import List, Optional, Union - -import numpy as np -import PIL.Image - -from ...utils import BaseOutput, is_paddle_available, is_paddlenlp_available - - -@dataclass -# Copied from ppdiffusers.pipelines.stable_diffusion.__init__.StableDiffusionPipelineOutput with Stable->Alt -class AltDiffusionPipelineOutput(BaseOutput): - """ - Output class for Alt Diffusion pipelines. - - Args: - images (`List[PIL.Image.Image]` or `np.ndarray`) - List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, - num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. 
- nsfw_content_detected (`List[bool]`) - List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, or `None` if safety checking could not be performed. - """ - - images: Union[List[PIL.Image.Image], np.ndarray] - nsfw_content_detected: Optional[List[bool]] - - -if is_paddlenlp_available() and is_paddle_available(): - from .modeling_roberta_series import RobertaSeriesModelWithTransformation - from .pipeline_alt_diffusion import AltDiffusionPipeline - from .pipeline_alt_diffusion_img2img import AltDiffusionImg2ImgPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/modeling_roberta_series.py b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/modeling_roberta_series.py deleted file mode 100644 index 9e27c50bcc92..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/modeling_roberta_series.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import Optional, Tuple - -import paddle -from paddle import nn - -from paddlenlp.transformers import RobertaConfig as XLMRobertaConfig -from paddlenlp.transformers import RobertaModel as XLMRobertaModel -from paddlenlp.transformers import RobertaPretrainedModel -from paddlenlp.transformers.model_outputs import ModelOutput - - -def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): - """ - Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols - are ignored. This is modified from fairseq's `utils.make_positions`. - - Args: - x: paddle.Tensor x: - Returns: paddle.Tensor - - """ - # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. - mask = (input_ids != padding_idx).cast("int64") - incremental_indices = (paddle.cumsum(mask, axis=1) + past_key_values_length) * mask - return incremental_indices + padding_idx - - -@dataclass -class TransformationModelOutput(ModelOutput): - """ - Base class for text model's outputs that also contains a pooling of the last hidden states. - - Args: - text_embeds (`paddle.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): - The text embeddings obtained by applying the projection layer to the pooler_output. - last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. 
- - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - projection_state: Optional[paddle.Tensor] = None - last_hidden_state: paddle.Tensor = None - hidden_states: Optional[Tuple[paddle.Tensor]] = None - attentions: Optional[Tuple[paddle.Tensor]] = None - - -class RobertaSeriesConfig(XLMRobertaConfig): - model_type = "roberta" - - def __init__( - self, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - project_dim=512, - pooler_fn="cls", - learn_encoder=False, - use_attention_mask=True, - **kwargs, - ): - kwargs["return_dict"] = kwargs.pop("return_dict", True) - super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) - self.project_dim = project_dim - self.pooler_fn = pooler_fn - self.learn_encoder = learn_encoder - self.use_attention_mask = use_attention_mask - - -class RobertaSeriesModelWithTransformation(RobertaPretrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler", r"logit_scale"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] - base_model_prefix = "roberta" - config_class = RobertaSeriesConfig - - def __init__(self, config: RobertaSeriesConfig): - super().__init__(config) - self.roberta = XLMRobertaModel(config) - # must reset _padding_idx - self.roberta.embeddings.word_embeddings._padding_idx = None - self.transformation = nn.Linear(config.hidden_size, config.project_dim) - self.has_pre_transformation = getattr(config, "has_pre_transformation", False) - if self.has_pre_transformation: - self.transformation_pre = nn.Linear(config.hidden_size, config.project_dim) - self.pre_LN = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.init_weights() - - def forward( - self, - input_ids: Optional[paddle.Tensor] = None, - attention_mask: Optional[paddle.Tensor] = None, - token_type_ids: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.Tensor] = None, - output_attentions: Optional[bool] = None, - return_dict: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - ): - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if position_ids is None: - position_ids = create_position_ids_from_input_ids(input_ids, self.config.pad_token_id) - outputs = self.base_model( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=True if self.has_pre_transformation else output_hidden_states, - return_dict=return_dict, - ) - - if self.has_pre_transformation: - sequence_output2 = outputs["hidden_states"][-2] - sequence_output2 = self.pre_LN(sequence_output2) - projection_state2 = self.transformation_pre(sequence_output2) - - return TransformationModelOutput( - projection_state=projection_state2, - last_hidden_state=outputs.last_hidden_state, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - else: - projection_state = self.transformation(outputs.last_hidden_state) - return TransformationModelOutput( - projection_state=projection_state, - 
last_hidden_state=outputs.last_hidden_state, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py deleted file mode 100644 index 17cb7039240c..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ /dev/null @@ -1,611 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Any, Callable, Dict, List, Optional, Union - -import paddle -from packaging import version - -from paddlenlp.transformers import CLIPImageProcessor, XLMRobertaTokenizer - -from ...configuration_utils import FrozenDict -from ...loaders import TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, logging, randn_tensor, replace_example_docstring -from ..pipeline_utils import DiffusionPipeline -from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker -from . import AltDiffusionPipelineOutput, RobertaSeriesModelWithTransformation - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import paddle - >>> from ppdiffusers import AltDiffusionPipeline - - >>> pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion-m9", paddle_dtype=paddle.float16) - - >>> # "dark elf princess, highly detailed, d & d, fantasy, highly detailed, digital painting, trending on artstation, concept art, sharp focus, illustration, art by artgerm and greg rutkowski and fuji choko and viktoria gavrilenko and hoang lap" - >>> prompt = "黑暗精灵公主,非常详细,幻想,非常详细,数字绘画,概念艺术,敏锐的焦点,插图" - >>> image = pipe(prompt).images[0] - ``` -""" - - -# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline with Stable->Alt, CLIPTextModel->RobertaSeriesModelWithTransformation, CLIPTokenizer->XLMRobertaTokenizer, AltDiffusionSafetyChecker->StableDiffusionSafetyChecker -class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin): - r""" - Pipeline for text-to-image generation using Alt Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
- - In addition the pipeline inherits the following loading methods: - - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] - - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] - as well as the following saving methods: - - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`RobertaSeriesModelWithTransformation`]): - Frozen text-encoder. Alt Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.RobertaSeriesModelWithTransformation), - specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`XLMRobertaTokenizer`): - Tokenizer of class - [XLMRobertaTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.XLMRobertaTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: RobertaSeriesModelWithTransformation, - tokenizer: XLMRobertaTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = True, - ): - super().__init__() - - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " - "to update the config accordingly as leaving `steps_offset` might led to incorrect results" - " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," - " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file" - ) - deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." - " `clip_sample` should be set to False in the configuration file. Please make sure to update the" - " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" - " future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" - " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" - ) - deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["clip_sample"] = False - scheduler._internal_dict = FrozenDict(new_config) - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Alt Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version).base_version - ) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 - if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: - deprecation_message = ( - "The configuration file of the unet has set the default `sample_size` to smaller than" - " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" - " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" - " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" - " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" - " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" - " in the config might lead to incorrect results in future versions. If you have downloaded this" - " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file" - ) - deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(unet.config) - new_config["sample_size"] = 64 - unet._internal_dict = FrozenDict(new_config) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. 
- - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because XLM-Roberta can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. 
Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - else: - latents = latents - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/ppdiffusers/models/cross_attention.py). - - Examples: - - Returns: - [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. 
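Step 0 above derives the default resolution from the UNet sample size and the VAE scale factor; a worked example with the values typical of Stable Diffusion v1-style checkpoints (assumed here only for illustration):

```python
unet_sample_size = 64   # latent-space resolution the UNet was trained on
vae_scale_factor = 8    # 2 ** (len(block_out_channels) - 1) for the KL VAE
height = width = unet_sample_size * vae_scale_factor
print(height, width)    # 512 512
```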
TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - image = latents - has_nsfw_concept = None - elif output_type == "pil": - # 8. Post-processing - image = self.decode_latents(latents) - - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - - # 10. Convert to PIL - image = self.numpy_to_pil(image) - else: - # 8. Post-processing - image = self.decode_latents(latents) - - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - - if not return_dict: - return (image, has_nsfw_concept) - - return AltDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py deleted file mode 100644 index d6a7eb120d5e..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ /dev/null @@ -1,682 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
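The guidance step inside the denoising loop above splits the doubled batch back into its unconditional and text-conditioned halves and recombines them with the guidance weight; isolated as a NumPy sketch with illustrative shapes:

```python
import numpy as np

guidance_scale = 7.5
# the UNet was run on a doubled batch: [unconditional, text-conditioned]
noise_pred = np.random.randn(2, 4, 64, 64).astype("float32")
noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2, axis=0)
guided = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
print(guided.shape)  # (1, 4, 64, 64)
```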
- -import inspect -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import paddle -import PIL -from packaging import version - -from paddlenlp.transformers import CLIPImageProcessor, XLMRobertaTokenizer - -from ...configuration_utils import FrozenDict -from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import ( - PIL_INTERPOLATION, - deprecate, - logging, - randn_tensor, - replace_example_docstring, -) -from ..pipeline_utils import DiffusionPipeline -from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker -from . import AltDiffusionPipelineOutput, RobertaSeriesModelWithTransformation - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import requests - >>> import paddle - >>> from PIL import Image - >>> from io import BytesIO - - >>> from ppdiffusers import AltDiffusionImg2ImgPipeline - - >>> model_id_or_path = "BAAI/AltDiffusion-m9" - >>> pipe = AltDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, paddle_dtype=paddle.float16) - - >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" - - >>> response = requests.get(url) - >>> init_image = Image.open(BytesIO(response.content)).convert("RGB") - >>> init_image = init_image.resize((768, 512)) - - >>> # "A fantasy landscape, trending on artstation" - >>> prompt = "幻想风景, artstation" - - >>> images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images - >>> images[0].save("幻想风景.png") - ``` -""" - - -# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess -def preprocess(image): - if isinstance(image, paddle.Tensor): - return image - elif isinstance(image, PIL.Image.Image): - image = [image] - - if isinstance(image[0], PIL.Image.Image): - w, h = image[0].size - w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - - image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] - image = np.concatenate(image, axis=0) - image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = 2.0 * image - 1.0 - image = paddle.to_tensor(image) - elif isinstance(image[0], paddle.Tensor): - image = paddle.concat(image, axis=0) - return image - - -# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline with Stable->Alt, CLIPTextModel->RobertaSeriesModelWithTransformation, CLIPTokenizer->XLMRobertaTokenizer, AltDiffusionSafetyChecker->StableDiffusionSafetyChecker -class AltDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin): - r""" - Pipeline for text-guided image to image generation using Alt Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
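The `preprocess` helper above snaps the input image to a multiple of 8 pixels and maps it from [0, 255] to [-1, 1] in NCHW layout; the same transform in a few lines of NumPy/PIL (a sketch, not the pipeline's exact code path):

```python
import numpy as np
from PIL import Image

img = Image.new("RGB", (769, 513))      # placeholder; any RGB PIL image works here
w, h = (x - x % 8 for x in img.size)    # round each side down to a multiple of 8
arr = np.array(img.resize((w, h), resample=Image.LANCZOS), dtype=np.float32) / 255.0
arr = arr[None].transpose(0, 3, 1, 2)   # HWC -> NCHW with a leading batch dimension
arr = 2.0 * arr - 1.0                   # [0, 1] -> [-1, 1]
print(arr.shape)                        # (1, 3, 512, 768)
```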
- - In addition the pipeline inherits the following loading methods: - - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] - - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] - as well as the following saving methods: - - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`RobertaSeriesModelWithTransformation`]): - Frozen text-encoder. Alt Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.RobertaSeriesModelWithTransformation), - specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`XLMRobertaTokenizer`): - Tokenizer of class - [XLMRobertaTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.XLMRobertaTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: RobertaSeriesModelWithTransformation, - tokenizer: XLMRobertaTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = True, - ): - super().__init__() - - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " - "to update the config accordingly as leaving `steps_offset` might led to incorrect results" - " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," - " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file" - ) - deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." - " `clip_sample` should be set to False in the configuration file. Please make sure to update the" - " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" - " future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" - " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" - ) - deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["clip_sample"] = False - scheduler._internal_dict = FrozenDict(new_config) - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Alt Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version).base_version - ) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 - if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: - deprecation_message = ( - "The configuration file of the unet has set the default `sample_size` to smaller than" - " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" - " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" - " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" - " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" - " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" - " in the config might lead to incorrect results in future versions. If you have downloaded this" - " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file" - ) - deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(unet.config) - new_config["sample_size"] = 64 - unet._internal_dict = FrozenDict(new_config) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) - self.register_to_config( - requires_safety_checker=requires_safety_checker, - ) - - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. 
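The backward-compatibility gate above compares the checkpoint's recorded library version against 0.9.0 using `packaging.version`; the comparison in isolation, with an illustrative version string:

```python
from packaging import version

recorded = "0.8.1"  # e.g. unet.config._ppdiffusers_version read from an old checkpoint
is_old = version.parse(version.parse(recorded).base_version) < version.parse("0.9.0.dev0")
print(is_old)       # True: such checkpoints also have sample_size patched up to 64
```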
- - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because XLM-Roberta can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. 
Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - def run_safety_checker(self, image, dtype): - if self.safety_checker is None: - has_nsfw_concept = None - else: - if paddle.is_tensor(image): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - else: - feature_extractor_input = self.image_processor.numpy_to_pil(image) - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=paddle.cast(safety_checker_input.pixel_values, dtype) - ) - return image, has_nsfw_concept - - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - # image = (image / 2 + 0.5).clip(0, 1) - return image - - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
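The tail of `_encode_prompt` above stacks the negative and positive embeddings into one batch so a single UNet forward pass covers both guidance branches; a shape-only NumPy sketch with illustrative dimensions:

```python
import numpy as np

batch, seq_len, dim = 2, 77, 768  # a typical text-encoder output shape
prompt_embeds = np.random.randn(batch, seq_len, dim).astype("float32")
negative_prompt_embeds = np.random.randn(batch, seq_len, dim).astype("float32")  # embeddings of the negative (or empty) prompts
cfg_batch = np.concatenate([negative_prompt_embeds, prompt_embeds], axis=0)
print(cfg_batch.shape)  # (4, 77, 768): unconditional rows first, conditional rows second
```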
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None - ): - if strength < 0 or strength > 1: - raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - def get_timesteps(self, num_inference_steps, strength): - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] - - return timesteps, num_inference_steps - t_start - - def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): - if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)): - raise ValueError( - f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or list but is {type(image)}" - ) - - image = image.cast(dtype) - - batch_size = batch_size * num_images_per_prompt - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
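`get_timesteps` above converts the img2img `strength` into how much of the schedule actually runs: with 50 steps and strength 0.75 only the last 37 timesteps are denoised, which keeps the output anchored to the input image. A worked sketch of that arithmetic, assuming a first-order scheduler (`order == 1`):

```python
num_inference_steps, strength = 50, 0.75

init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 37
t_start = max(num_inference_steps - init_timestep, 0)                          # 13
steps_actually_run = num_inference_steps - t_start                             # 37

print(init_timestep, t_start, steps_actually_run)  # 37 13 37
```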
- ) - - if isinstance(generator, list): - init_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) - ] - init_latents = paddle.concat(init_latents, axis=0) - else: - init_latents = self.vae.encode(image).latent_dist.sample(generator) - - init_latents = self.vae.config.scaling_factor * init_latents - - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: - # expand init_latents for batch_size - deprecation_message = ( - f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" - " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" - " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many initial images as text prompts to suppress this warning." - ) - deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) - additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = paddle.concat([init_latents] * additional_image_per_prompt, axis=0) - elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." - ) - else: - init_latents = paddle.concat([init_latents], axis=0) - - shape = init_latents.shape - noise = randn_tensor(shape, generator=generator, dtype=dtype) - - # get latents - init_latents = self.scheduler.add_noise(init_latents, noise, timestep) - latents = init_latents - - return latents - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: Union[paddle.Tensor, PIL.Image.Image] = None, - strength: float = 0.8, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. - strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` - will be used as a starting point, adding more noise to it the larger the `strength`. The number of - denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will - be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. 
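The img2img latent preparation above encodes the input image with the VAE, scales it by the VAE `scaling_factor`, and then noises it up to the strength-dependent start timestep via `scheduler.add_noise`. A minimal stand-in for that flow, using the closed-form DDPM noising formula in place of the real scheduler call (all values illustrative):

```python
import numpy as np

init_latents = np.random.randn(1, 4, 64, 64).astype("float32")  # stand-in for vae.encode(image).latent_dist.sample()
scaling_factor = 0.18215                                        # KL VAE scaling factor used by SD-family checkpoints
init_latents = scaling_factor * init_latents

alpha_bar_t = 0.55                                              # alpha-bar at the chosen start timestep
noise = np.random.randn(*init_latents.shape).astype("float32")
latents = np.sqrt(alpha_bar_t) * init_latents + np.sqrt(1.0 - alpha_bar_t) * noise  # x_t from x_0
print(latents.shape)  # (1, 4, 64, 64)
```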
More denoising steps usually lead to a higher quality image at the - expense of slower inference. This parameter will be modulated by `strength`. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - Examples: - - Returns: - [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - - # 1. Check inputs. 
Raise error if not correct - self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Preprocess image - image = self.image_processor.preprocess(image) - - # 5. set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - latent_timestep = timesteps[:1].tile((batch_size * num_images_per_prompt,)) - - # 6. Prepare latent variables - latents = self.prepare_latents( - image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, generator - ) - - # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 8. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if not output_type == "latent": - image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return AltDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/__init__.py deleted file mode 100644 index 
1b1f0198a9ce..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa -from .mel import Mel -from .pipeline_audio_diffusion import AudioDiffusionPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/mel.py b/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/mel.py deleted file mode 100644 index 00b19b100722..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/mel.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import numpy as np # noqa: E402 - -from ...configuration_utils import ConfigMixin, register_to_config -from ...schedulers.scheduling_utils import SchedulerMixin - -try: - import librosa # noqa: E402 - - _librosa_can_be_imported = True - _import_error = "" -except Exception as e: - _librosa_can_be_imported = False - _import_error = ( - f"Cannot import librosa because {e}. Make sure to correctly install librosa to be able to install it." - ) - - -from PIL import Image # noqa: E402 - - -class Mel(ConfigMixin, SchedulerMixin): - """ - Parameters: - x_res (`int`): x resolution of spectrogram (time) - y_res (`int`): y resolution of spectrogram (frequency bins) - sample_rate (`int`): sample rate of audio - n_fft (`int`): number of Fast Fourier Transforms - hop_length (`int`): hop length (a higher number is recommended for lower than 256 y_res) - top_db (`int`): loudest in decibels - n_iter (`int`): number of iterations for Griffin Linn mel inversion - """ - - config_name = "mel_config.json" - - @register_to_config - def __init__( - self, - x_res: int = 256, - y_res: int = 256, - sample_rate: int = 22050, - n_fft: int = 2048, - hop_length: int = 512, - top_db: int = 80, - n_iter: int = 32, - ): - self.hop_length = hop_length - self.sr = sample_rate - self.n_fft = n_fft - self.top_db = top_db - self.n_iter = n_iter - self.set_resolution(x_res, y_res) - self.audio = None - - if not _librosa_can_be_imported: - raise ValueError(_import_error) - - def set_resolution(self, x_res: int, y_res: int): - """Set resolution. 
- - Args: - x_res (`int`): x resolution of spectrogram (time) - y_res (`int`): y resolution of spectrogram (frequency bins) - """ - self.x_res = x_res - self.y_res = y_res - self.n_mels = self.y_res - self.slice_size = self.x_res * self.hop_length - 1 - - def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None): - """Load audio. - - Args: - audio_file (`str`): must be a file on disk due to Librosa limitation or - raw_audio (`np.ndarray`): audio as numpy array - """ - if audio_file is not None: - self.audio, _ = librosa.load(audio_file, mono=True, sr=self.sr) - else: - self.audio = raw_audio - - # Pad with silence if necessary. - if len(self.audio) < self.x_res * self.hop_length: - self.audio = np.concatenate([self.audio, np.zeros((self.x_res * self.hop_length - len(self.audio),))]) - - def get_number_of_slices(self) -> int: - """Get number of slices in audio. - - Returns: - `int`: number of spectograms audio can be sliced into - """ - return len(self.audio) // self.slice_size - - def get_audio_slice(self, slice: int = 0) -> np.ndarray: - """Get slice of audio. - - Args: - slice (`int`): slice number of audio (out of get_number_of_slices()) - - Returns: - `np.ndarray`: audio as numpy array - """ - return self.audio[self.slice_size * slice : self.slice_size * (slice + 1)] - - def get_sample_rate(self) -> int: - """Get sample rate: - - Returns: - `int`: sample rate of audio - """ - return self.sr - - def audio_slice_to_image(self, slice: int) -> Image.Image: - """Convert slice of audio to spectrogram. - - Args: - slice (`int`): slice number of audio to convert (out of get_number_of_slices()) - - Returns: - `PIL Image`: grayscale image of x_res x y_res - """ - S = librosa.feature.melspectrogram( - y=self.get_audio_slice(slice), sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels - ) - log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db) - bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) + 0.5).astype(np.uint8) - image = Image.fromarray(bytedata) - return image - - def image_to_audio(self, image: Image.Image) -> np.ndarray: - """Converts spectrogram to audio. - - Args: - image (`PIL Image`): x_res x y_res grayscale image - - Returns: - audio (`np.ndarray`): raw audio - """ - bytedata = np.frombuffer(image.tobytes(), dtype="uint8").reshape((image.height, image.width)) - log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db - S = librosa.db_to_power(log_S) - audio = librosa.feature.inverse.mel_to_audio( - S, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_iter=self.n_iter - ) - return audio diff --git a/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py deleted file mode 100644 index ae6d2b23e521..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py +++ /dev/null @@ -1,273 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
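`audio_slice_to_image` and `image_to_audio` above round-trip a log-mel spectrogram through an 8-bit grayscale image by mapping [-top_db, 0] dB onto [0, 255]. The quantization and its inverse in isolation (librosa calls omitted; the spectrogram here is synthetic):

```python
import numpy as np

top_db = 80
log_S = np.random.uniform(-top_db, 0.0, size=(256, 256))  # stand-in for librosa.power_to_db(...)

# dB -> uint8 pixels, as in audio_slice_to_image
bytedata = (((log_S + top_db) * 255 / top_db).clip(0, 255) + 0.5).astype(np.uint8)

# uint8 pixels -> dB, as in image_to_audio
log_S_back = bytedata.astype("float") * top_db / 255 - top_db

print(np.abs(log_S - log_S_back).max())  # at most ~0.16 dB, half the quantization step
```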
-# See the License for the specific language governing permissions and -# limitations under the License. - - -from math import acos, sin -from typing import List, Tuple, Union - -import numpy as np -import paddle -from PIL import Image - -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import DDIMScheduler, DDPMScheduler -from ...utils import randn_tensor -from ..pipeline_utils import ( - AudioPipelineOutput, - BaseOutput, - DiffusionPipeline, - ImagePipelineOutput, -) -from .mel import Mel - - -class AudioDiffusionPipeline(DiffusionPipeline): - """ - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Parameters: - vqae ([`AutoencoderKL`]): Variational AutoEncoder for Latent Audio Diffusion or None - unet ([`UNet2DConditionModel`]): UNET model - mel ([`Mel`]): transform audio <-> spectrogram - scheduler ([`DDIMScheduler` or `DDPMScheduler`]): de-noising scheduler - """ - - _optional_components = ["vqvae"] - - def __init__( - self, - vqvae: AutoencoderKL, - unet: UNet2DConditionModel, - mel: Mel, - scheduler: Union[DDIMScheduler, DDPMScheduler], - ): - super().__init__() - self.register_modules(unet=unet, scheduler=scheduler, mel=mel, vqvae=vqvae) - - def get_input_dims(self) -> Tuple: - """Returns dimension of input image - - Returns: - `Tuple`: (height, width) - """ - input_module = self.vqvae if self.vqvae is not None else self.unet - # For backwards compatibility - sample_size = ( - (input_module.config.sample_size, input_module.config.sample_size) - if type(input_module.config.sample_size) == int - else input_module.config.sample_size - ) - return sample_size - - def get_default_steps(self) -> int: - """Returns default number of steps recommended for inference - - Returns: - `int`: number of steps - """ - return 50 if isinstance(self.scheduler, DDIMScheduler) else 1000 - - @paddle.no_grad() - def __call__( - self, - batch_size: int = 1, - audio_file: str = None, - raw_audio: np.ndarray = None, - slice: int = 0, - start_step: int = 0, - steps: int = None, - generator: paddle.Generator = None, - mask_start_secs: float = 0, - mask_end_secs: float = 0, - step_generator: paddle.Generator = None, - eta: float = 0, - noise: paddle.Tensor = None, - encoding: paddle.Tensor = None, - return_dict=True, - ) -> Union[ - Union[AudioPipelineOutput, ImagePipelineOutput], - Tuple[List[Image.Image], Tuple[int, List[np.ndarray]]], - ]: - """Generate random mel spectrogram from audio input and convert to audio. 
- - Args: - batch_size (`int`): number of samples to generate - audio_file (`str`): must be a file on disk due to Librosa limitation or - raw_audio (`np.ndarray`): audio as numpy array - slice (`int`): slice number of audio to convert - start_step (int): step to start from - steps (`int`): number of de-noising steps (defaults to 50 for DDIM, 1000 for DDPM) - generator (`paddle.Generator`): random number generator or None - mask_start_secs (`float`): number of seconds of audio to mask (not generate) at start - mask_end_secs (`float`): number of seconds of audio to mask (not generate) at end - step_generator (`paddle.Generator`): random number generator used to de-noise or None - eta (`float`): parameter between 0 and 1 used with DDIM scheduler - noise (`paddle.Tensor`): noise tensor of shape (batch_size, 1, height, width) or None - encoding (`paddle.Tensor`): for UNet2DConditionModel shape (batch_size, seq_length, cross_attention_dim) - return_dict (`bool`): if True return AudioPipelineOutput, ImagePipelineOutput else Tuple - - Returns: - `List[PIL Image]`: mel spectrograms (`float`, `List[np.ndarray]`): sample rate and raw audios - """ - - steps = steps or self.get_default_steps() - self.scheduler.set_timesteps(steps) - step_generator = step_generator or generator - # For backwards compatibility - if type(self.unet.config.sample_size) == int: - self.unet.config.sample_size = (self.unet.config.sample_size, self.unet.config.sample_size) - input_dims = self.get_input_dims() - self.mel.set_resolution(x_res=input_dims[1], y_res=input_dims[0]) - if noise is None: - noise = randn_tensor( - ( - batch_size, - self.unet.config.in_channels, - self.unet.config.sample_size[0], - self.unet.config.sample_size[1], - ), - generator=generator, - ) - images = noise - mask = None - - if audio_file is not None or raw_audio is not None: - self.mel.load_audio(audio_file, raw_audio) - input_image = self.mel.audio_slice_to_image(slice) - input_image = np.frombuffer(input_image.tobytes(), dtype="uint8").reshape( - (input_image.height, input_image.width) - ) - input_image = (input_image / 255) * 2 - 1 - input_images = paddle.to_tensor(input_image[np.newaxis, :, :], dtype=paddle.float32) - - if self.vqvae is not None: - input_images = self.vqvae.encode(paddle.unsqueeze(input_images, 0)).latent_dist.sample( - generator=generator - )[0] - input_images = self.vqvae.config.scaling_factor * input_images - - if start_step > 0: - images[0, 0] = self.scheduler.add_noise(input_images, noise, self.scheduler.timesteps[start_step - 1]) - - pixels_per_second = ( - self.unet.config.sample_size[1] * self.mel.get_sample_rate() / self.mel.x_res / self.mel.hop_length - ) - mask_start = int(mask_start_secs * pixels_per_second) - mask_end = int(mask_end_secs * pixels_per_second) - mask = self.scheduler.add_noise( - input_images, noise, paddle.to_tensor(self.scheduler.timesteps[start_step:]) - ) - - for step, t in enumerate(self.progress_bar(self.scheduler.timesteps[start_step:])): - if isinstance(self.unet, UNet2DConditionModel): - model_output = self.unet(images, t, encoding)["sample"] - else: - model_output = self.unet(images, t)["sample"] - - if isinstance(self.scheduler, DDIMScheduler): - images = self.scheduler.step( - model_output=model_output, - timestep=t, - sample=images, - eta=eta, - generator=step_generator, - )["prev_sample"] - else: - images = self.scheduler.step( - model_output=model_output, - timestep=t, - sample=images, - generator=step_generator, - )["prev_sample"] - - if mask is not None: - if mask_start > 0: - 
images[:, :, :, :mask_start] = mask[:, step, :, :mask_start] - if mask_end > 0: - images[:, :, :, -mask_end:] = mask[:, step, :, -mask_end:] - - if self.vqvae is not None: - # 0.18215 was scaling factor used in training to ensure unit variance - images = 1 / self.vqvae.config.scaling_factor * images - images = self.vqvae.decode(images)["sample"] - - images = (images / 2 + 0.5).clip(0, 1) - images = images.transpose([0, 2, 3, 1]).cast("float32").numpy() - images = (images * 255).round().astype("uint8") - images = list( - (Image.fromarray(_[:, :, 0]) for _ in images) - if images.shape[3] == 1 - else (Image.fromarray(_, mode="RGB").convert("L") for _ in images) - ) - - audios = [self.mel.image_to_audio(_) for _ in images] - if not return_dict: - return images, (self.mel.get_sample_rate(), audios) - - return BaseOutput(**AudioPipelineOutput(np.array(audios)[:, np.newaxis, :]), **ImagePipelineOutput(images)) - - @paddle.no_grad() - def encode(self, images: List[Image.Image], steps: int = 50) -> np.ndarray: - """Reverse step process: recover noisy image from generated image. - - Args: - images (`List[PIL Image]`): list of images to encode - steps (`int`): number of encoding steps to perform (defaults to 50) - - Returns: - `np.ndarray`: noise tensor of shape (batch_size, 1, height, width) - """ - - # Only works with DDIM as this method is deterministic - assert isinstance(self.scheduler, DDIMScheduler) - self.scheduler.set_timesteps(steps) - sample = np.array( - [np.frombuffer(image.tobytes(), dtype="uint8").reshape((1, image.height, image.width)) for image in images] - ) - sample = (sample / 255) * 2 - 1 - sample = paddle.to_tensor(sample) - - for t in self.progress_bar(paddle.flip(self.scheduler.timesteps, (0,))): - prev_timestep = t - self.scheduler.num_train_timesteps // self.scheduler.num_inference_steps - alpha_prod_t = self.scheduler.alphas_cumprod[t] - alpha_prod_t_prev = ( - self.scheduler.alphas_cumprod[prev_timestep] - if prev_timestep >= 0 - else self.scheduler.final_alpha_cumprod - ) - beta_prod_t = 1 - alpha_prod_t - model_output = self.unet(sample, t)["sample"] - pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * model_output - sample = (sample - pred_sample_direction) * alpha_prod_t_prev ** (-0.5) - sample = sample * alpha_prod_t ** (0.5) + beta_prod_t ** (0.5) * model_output - - return sample - - @staticmethod - def slerp(x0: paddle.Tensor, x1: paddle.Tensor, alpha: float) -> paddle.Tensor: - """Spherical Linear intERPolation - - Args: - x0 (`paddle.Tensor`): first tensor to interpolate between - x1 (`paddle.Tensor`): seconds tensor to interpolate between - alpha (`float`): interpolation between 0 and 1 - - Returns: - `paddle.Tensor`: interpolated tensor - """ - - theta = acos(paddle.dot(paddle.flatten(x0), paddle.flatten(x1)) / paddle.norm(x0) / paddle.norm(x1)) - return sin((1 - alpha) * theta) * x0 / sin(theta) + sin(alpha * theta) * x1 / sin(theta) diff --git a/ppdiffusers/ppdiffusers/pipelines/audioldm/__init__.py b/ppdiffusers/ppdiffusers/pipelines/audioldm/__init__.py deleted file mode 100644 index 4ab25efc2000..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/audioldm/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
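The `slerp` helper above interpolates along the great circle between two noise tensors, which preserves their norms better than a straight linear mix; the same formula as a NumPy sketch:

```python
import numpy as np

def slerp(x0, x1, alpha):
    # spherical linear interpolation between two tensors, treated as flattened vectors
    theta = np.arccos(np.dot(x0.ravel(), x1.ravel()) / (np.linalg.norm(x0) * np.linalg.norm(x1)))
    return (np.sin((1 - alpha) * theta) * x0 + np.sin(alpha * theta) * x1) / np.sin(theta)

x0 = np.random.randn(1, 1, 256, 256)
x1 = np.random.randn(1, 1, 256, 256)
print(slerp(x0, x1, 0.5).shape)  # (1, 1, 256, 256)
```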
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ...utils import ( - OptionalDependencyNotAvailable, - is_paddle_available, - is_paddlenlp_available, - is_paddlenlp_version, -) - -try: - if not (is_paddlenlp_available() and is_paddle_available() and is_paddlenlp_version(">=", "2.5.2")): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ...utils.dummy_paddle_and_paddlenlp_objects import AudioLDMPipeline -else: - from .pipeline_audioldm import AudioLDMPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/audioldm/pipeline_audioldm.py b/ppdiffusers/ppdiffusers/pipelines/audioldm/pipeline_audioldm.py deleted file mode 100644 index 95fb0f89dee6..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/audioldm/pipeline_audioldm.py +++ /dev/null @@ -1,505 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import paddle -import paddle.nn.functional as F - -from paddlenlp.transformers import ( - ClapTextModelWithProjection, - RobertaTokenizer, - SpeechT5HifiGan, -) - -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import logging, randn_tensor, replace_example_docstring -from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline - -logger = logging.get_logger(__name__) -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import paddle - >>> from ppdiffusers import AudioLDMPipeline - - >>> pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", paddle_dtype=paddle.float16) - - >>> prompt = "A hammer hitting a wooden surface" - >>> audio = pipe(prompt).audio[0] - ``` -""" - - -class AudioLDMPipeline(DiffusionPipeline): - """ - Pipeline for text-to-audio generation using AudioLDM. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode audios to and from latent representations. - text_encoder ([`ClapTextModelWithProjection`]): - Frozen text-encoder. AudioLDM uses the text portion of CLAP, - specifically the RoBERTa HSTAT-unfused variant. - tokenizer ([`PreTrainedTokenizer`]): - Tokenizer of class RobertaTokenizer. - unet ([`UNet2DConditionModel`]): U-Net architecture to denoise the encoded audio latents. 
- scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - vocoder ([`SpeechT5HifiGan`]): - Vocoder of class SpeechT5HifiGan. - """ - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: ClapTextModelWithProjection, - tokenizer: RobertaTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - vocoder: SpeechT5HifiGan, - ): - super().__init__() - self.register_modules( - vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=scheduler, vocoder=vocoder - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - - def _encode_prompt( - self, - prompt, - num_waveforms_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - """ - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_waveforms_per_prompt (`int`): - number of waveforms that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the audio generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. 
- """ - if self.text_encoder.text_model.embeddings.token_type_ids.dtype not in [ - paddle.int16, - paddle.int32, - paddle.int64, - ]: - self.text_encoder.text_model.embeddings.token_type_ids = ( - self.text_encoder.text_model.embeddings.token_type_ids.cast("int32") - ) - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - if prompt_embeds is None: - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - return_attention_mask=True, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - attention_mask = text_inputs.attention_mask - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd", return_attention_mask=True - ).input_ids - if ( - untruncated_ids.shape[-1] >= text_input_ids.shape[-1] - and not paddle.equal_all(x=text_input_ids, y=untruncated_ids).item() - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - f"The following part of your input was truncated because CLAP can only handle sequences up to {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - prompt_embeds = self.text_encoder(text_input_ids.cast("int32"), attention_mask=attention_mask) - prompt_embeds = prompt_embeds.text_embeds - # additional L_2 normalization over each hidden-state - prompt_embeds = F.normalize(x=prompt_embeds, axis=-1) - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - bs_embed, seq_len = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile(repeat_times=[1, num_waveforms_per_prompt]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_waveforms_per_prompt, seq_len]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} != {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`: {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - return_attention_mask=True, - ) - uncond_input_ids = uncond_input.input_ids - attention_mask = uncond_input.attention_mask - negative_prompt_embeds = self.text_encoder(uncond_input_ids.cast("int32"), attention_mask=attention_mask) - negative_prompt_embeds = negative_prompt_embeds.text_embeds - # additional L_2 normalization over each hidden-state - negative_prompt_embeds = F.normalize(x=negative_prompt_embeds, axis=-1) - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile(repeat_times=[1, num_waveforms_per_prompt]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_waveforms_per_prompt, seq_len]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds]) - return prompt_embeds - - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - mel_spectrogram = self.vae.decode(latents).sample - return mel_spectrogram - - def mel_spectrogram_to_waveform(self, mel_spectrogram): - if mel_spectrogram.dim() == 4: - mel_spectrogram = mel_spectrogram.squeeze(axis=1) - waveform = self.vocoder(mel_spectrogram) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - waveform = waveform.astype(dtype="float32").cpu() - return waveform - - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - audio_length_in_s, - vocoder_upsample_factor, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - min_audio_length_in_s = vocoder_upsample_factor * self.vae_scale_factor - if audio_length_in_s < min_audio_length_in_s: - raise ValueError( - f"`audio_length_in_s` has to be a positive value greater than or equal to {min_audio_length_in_s}, but is {audio_length_in_s}." - ) - if self.vocoder.config.model_in_dim % self.vae_scale_factor != 0: - raise ValueError( - f"The number of frequency bins in the vocoder's log-mel spectrogram has to be divisible by the VAE scale factor, but got {self.vocoder.config.model_in_dim} bins and a scale factor of {self.vae_scale_factor}." 
- ) - if ( - callback_steps is None - or callback_steps is not None - and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}." - ) - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - f"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds` {negative_prompt_embeds.shape}." - ) - - def prepare_latents(self, batch_size, num_channels_latents, height, dtype, generator, latents=None): - shape = ( - batch_size, - num_channels_latents, - height // self.vae_scale_factor, - self.vocoder.config.model_in_dim // self.vae_scale_factor, - ) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - else: - latents = latents - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]] = None, - audio_length_in_s: Optional[float] = None, - num_inference_steps: int = 10, - guidance_scale: float = 2.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_waveforms_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - output_type: Optional[str] = "np", - ): - """ - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the audio generation. If not defined, one has to pass `prompt_embeds`. - instead. - audio_length_in_s (`int`, *optional*, defaults to 5.12): - The length of the generated audio sample in seconds. - num_inference_steps (`int`, *optional*, defaults to 10): - The number of denoising steps. 
More denoising steps usually lead to a higher quality audio at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 2.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate audios that are closely linked to the text `prompt`, - usually at the expense of lower sound quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the audio generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_waveforms_per_prompt (`int`, *optional*, defaults to 1): - The number of waveforms to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) - to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for audio - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under - `self.processor` in ppdiffusers.cross_attention. - output_type (`str`, *optional*, defaults to `"np"`): - The output format of the generate image. Choose between: - - `"np"`: Return Numpy `np.ndarray` objects. - - `"pt"`: Return `paddle.Tensor` objects. - - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated audios. - """ - # 0. 
Convert audio input length from seconds to spectrogram height - vocoder_upsample_factor = np.prod(self.vocoder.config.upsample_rates) / self.vocoder.config.sampling_rate - if audio_length_in_s is None: - audio_length_in_s = self.unet.config.sample_size * self.vae_scale_factor * vocoder_upsample_factor - height = int(audio_length_in_s / vocoder_upsample_factor) - original_waveform_length = int(audio_length_in_s * self.vocoder.config.sampling_rate) - if height % self.vae_scale_factor != 0: - height = int(np.ceil(height / self.vae_scale_factor)) * self.vae_scale_factor - logger.info( - f"Audio length in seconds {audio_length_in_s} is increased to {height * vocoder_upsample_factor} so that it can be handled by the model. It will be cut to {audio_length_in_s} after the denoising process." - ) - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - audio_length_in_s, - vocoder_upsample_factor, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_waveforms_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_waveforms_per_prompt, - num_channels_latents, - height, - prompt_embeds.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=None, - class_labels=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0: - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 8. 
Post-processing - mel_spectrogram = self.decode_latents(latents) - - audio = self.mel_spectrogram_to_waveform(mel_spectrogram) - - audio = audio[:, :original_waveform_length] - - if output_type == "np": - audio = audio.numpy() - - if not return_dict: - return (audio,) - - return AudioPipelineOutput(audios=audio) diff --git a/ppdiffusers/ppdiffusers/pipelines/dance_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/dance_diffusion/__init__.py deleted file mode 100644 index 21884d53c3b1..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/dance_diffusion/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa -from .pipeline_dance_diffusion import DanceDiffusionPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py deleted file mode 100644 index b4bc68019bf3..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List, Optional, Tuple, Union - -import paddle - -from ...utils import logging, randn_tensor -from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline - -logger = logging.get_logger(__name__) - - -class DanceDiffusionPipeline(DiffusionPipeline): - """ - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Parameters: - unet ([`UNet1DModel`]): U-Net architecture to denoise the encoded image. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of - [`IPNDMScheduler`]. 
- """ - - def __init__(self, unet, scheduler): - super().__init__() - self.register_modules(unet=unet, scheduler=scheduler) - - @paddle.no_grad() - def __call__( - self, - batch_size: int = 1, - num_inference_steps: int = 100, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - audio_length_in_s: Optional[float] = None, - return_dict: bool = True, - ) -> Union[AudioPipelineOutput, Tuple]: - """ - Args: - batch_size (`int`, *optional*, defaults to 1): - The number of audio samples to generate. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality audio sample at - the expense of slower inference. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - audio_length_in_s (`float`, *optional*, defaults to `self.unet.config.sample_size/self.unet.config.sample_rate`): - The length of the generated audio sample in seconds. Note that the output of the pipeline, *i.e.* - `sample_size`, will be `audio_length_in_s` * `self.unet.config.sample_rate`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple. - - Returns: - [`~pipelines.AudioPipelineOutput`] or `tuple`: [`~pipelines.utils.AudioPipelineOutput`] if `return_dict` is - True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. - """ - if audio_length_in_s is None: - audio_length_in_s = self.unet.config.sample_size / self.unet.config.sample_rate - sample_size = audio_length_in_s * self.unet.config.sample_rate - down_scale_factor = 2 ** len(self.unet.up_blocks) - if sample_size < 3 * down_scale_factor: - raise ValueError( - f"{audio_length_in_s} is too small. Make sure it's bigger or equal to {3 * down_scale_factor / self.unet.config.sample_rate}." - ) - original_sample_size = int(sample_size) - if sample_size % down_scale_factor != 0: - sample_size = ( - audio_length_in_s * self.unet.config.sample_rate // down_scale_factor + 1 - ) * down_scale_factor - logger.info( - f"{audio_length_in_s} is increased to {sample_size / self.unet.config.sample_rate} so that it can be handled by the model. It will be cut to {original_sample_size / self.unet.config.sample_rate} after the denoising process." - ) - sample_size = int(sample_size) - dtype = self.unet.dtype - shape = batch_size, self.unet.config.in_channels, sample_size - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - audio = randn_tensor(shape, generator=generator, dtype=dtype) - # set step values - self.scheduler.set_timesteps(num_inference_steps) - # TODO donot cast dtype here - # self.scheduler.timesteps = self.scheduler.timesteps.cast(dtype) - - for t in self.progress_bar(self.scheduler.timesteps): - # 1. predict noise model_output - model_output = self.unet(audio, t).sample - - # 2. 
compute previous image: x_t -> t_t-1 - audio = self.scheduler.step(model_output, t, audio).prev_sample - - audio = audio.clip(min=-1, max=1).astype(dtype="float32").cpu().numpy() - audio = audio[:, :, :original_sample_size] - if not return_dict: - return (audio,) - return AudioPipelineOutput(audios=audio) diff --git a/ppdiffusers/ppdiffusers/pipelines/ddim/__init__.py b/ppdiffusers/ppdiffusers/pipelines/ddim/__init__.py deleted file mode 100644 index e45978737c4c..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/ddim/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa -from .pipeline_ddim import DDIMPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/ddim/pipeline_ddim.py b/ppdiffusers/ppdiffusers/pipelines/ddim/pipeline_ddim.py deleted file mode 100644 index b3248d65baef..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/ddim/pipeline_ddim.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List, Optional, Tuple, Union - -import paddle - -from ...schedulers import DDIMScheduler -from ...utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput - - -class DDIMPipeline(DiffusionPipeline): - """ - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Parameters: - unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of - [`DDPMScheduler`], or [`DDIMScheduler`]. 
- """ - - def __init__(self, unet, scheduler): - super().__init__() - # make sure scheduler can always be converted to DDIM - scheduler = DDIMScheduler.from_config(scheduler.config) - - self.register_modules(unet=unet, scheduler=scheduler) - - @paddle.no_grad() - def __call__( - self, - batch_size: int = 1, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - eta: float = 0.0, - num_inference_steps: int = 50, - use_clipped_model_output: Optional[bool] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - ) -> Union[ImagePipelineOutput, Tuple]: - """ - Args: - batch_size (`int`, *optional*, defaults to 1): - The number of images to generate. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - eta (`float`, *optional*, defaults to 0.0): - The eta parameter which controls the scale of the variance (0 is DDIM and 1 is one type of DDPM). - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - use_clipped_model_output (`bool`, *optional*, defaults to `None`): - if `True` or `False`, see documentation for `DDIMScheduler.step`. If `None`, nothing is passed - downstream to the scheduler. So use `None` for schedulers which don't support this argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - - Returns: - [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is - True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. - """ - - # Sample gaussian noise to begin loop - if isinstance(self.unet.config.sample_size, int): - image_shape = ( - batch_size, - self.unet.config.in_channels, - self.unet.config.sample_size, - self.unet.config.sample_size, - ) - else: - image_shape = (batch_size, self.unet.config.in_channels, *self.unet.config.sample_size) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - image = randn_tensor(image_shape, generator=generator, dtype=self.unet.dtype) - - # set step values - self.scheduler.set_timesteps(num_inference_steps) - - for t in self.progress_bar(self.scheduler.timesteps): - # 1. predict noise model_output - model_output = self.unet(image, t).sample - - # 2. 
predict previous mean of image x_t-1 and add variance depending on eta - # eta corresponds to η in paper and should be between [0, 1] - # do x_t -> x_t-1 - image = self.scheduler.step( - model_output, t, image, eta=eta, use_clipped_model_output=use_clipped_model_output, generator=generator - ).prev_sample - image = (image / 2 + 0.5).clip(min=0, max=1) - image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy() - - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,) - return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/ddpm/__init__.py b/ppdiffusers/ppdiffusers/pipelines/ddpm/__init__.py deleted file mode 100644 index 31d34721ae94..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/ddpm/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa -from .pipeline_ddpm import DDPMPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/ddpm/pipeline_ddpm.py b/ppdiffusers/ppdiffusers/pipelines/ddpm/pipeline_ddpm.py deleted file mode 100644 index 3637c9ea3f9e..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/ddpm/pipeline_ddpm.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List, Optional, Tuple, Union - -import paddle - -from ...utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput - - -class DDPMPipeline(DiffusionPipeline): - """ - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Parameters: - unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of - [`DDPMScheduler`], or [`DDIMScheduler`]. 
- """ - - def __init__(self, unet, scheduler): - super().__init__() - self.register_modules(unet=unet, scheduler=scheduler) - - @paddle.no_grad() - def __call__( - self, - batch_size: int = 1, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - num_inference_steps: int = 1000, - output_type: Optional[str] = "pil", - return_dict: bool = True, - ) -> Union[ImagePipelineOutput, Tuple]: - """ - Args: - batch_size (`int`, *optional*, defaults to 1): - The number of images to generate. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - num_inference_steps (`int`, *optional*, defaults to 1000): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - - Returns: - [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is - True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. - """ - # Sample gaussian noise to begin loop - if isinstance(self.unet.config.sample_size, int): - image_shape = ( - batch_size, - self.unet.config.in_channels, - self.unet.config.sample_size, - self.unet.config.sample_size, - ) - else: - image_shape = (batch_size, self.unet.config.in_channels, *self.unet.config.sample_size) - image = randn_tensor(image_shape, generator=generator) - # set step values - self.scheduler.set_timesteps(num_inference_steps) - for t in self.progress_bar(self.scheduler.timesteps): - # 1. predict noise model_output - model_output = self.unet(image, t).sample - - # 2. compute previous image: x_t -> x_t-1 - image = self.scheduler.step(model_output, t, image, generator=generator).prev_sample - image = (image / 2 + 0.5).clip(min=0, max=1) - image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy() - if output_type == "pil": - image = self.numpy_to_pil(image) - if not return_dict: - return (image,) - return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/__init__.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/__init__.py deleted file mode 100644 index fccb87f08b7b..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/__init__.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from dataclasses import dataclass -from typing import List, Optional, Union - -import numpy as np -import PIL - -from ...utils import ( - BaseOutput, - OptionalDependencyNotAvailable, - is_paddle_available, - is_paddlenlp_available, -) -from .timesteps import ( - fast27_timesteps, - smart27_timesteps, - smart50_timesteps, - smart100_timesteps, - smart185_timesteps, - super27_timesteps, - super40_timesteps, - super100_timesteps, -) - - -@dataclass -class IFPipelineOutput(BaseOutput): - """ - Args: - Output class for Stable Diffusion pipelines. - images (`List[PIL.Image.Image]` or `np.ndarray`) - List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, - num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. - nsfw_detected (`List[bool]`) - List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content or a watermark. `None` if safety checking could not be performed. - watermark_detected (`List[bool]`) - List of flags denoting whether the corresponding generated image likely has a watermark. `None` if safety - checking could not be performed. - """ - - images: Union[List[PIL.Image.Image], np.ndarray] - nsfw_detected: Optional[List[bool]] - watermark_detected: Optional[List[bool]] - - -try: - if not (is_paddle_available() and is_paddlenlp_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ...utils.dummy_paddle_and_paddlenlp_objects import * # noqa F403 -else: - from .pipeline_if import IFPipeline - from .pipeline_if_img2img import IFImg2ImgPipeline - from .pipeline_if_img2img_superresolution import IFImg2ImgSuperResolutionPipeline - from .pipeline_if_inpainting import IFInpaintingPipeline - from .pipeline_if_inpainting_superresolution import ( - IFInpaintingSuperResolutionPipeline, - ) - from .pipeline_if_superresolution import IFSuperResolutionPipeline - from .safety_checker import IFSafetyChecker - from .watermark import IFWatermarker diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if.py deleted file mode 100644 index 206c904b3aec..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if.py +++ /dev/null @@ -1,750 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import html -import inspect -import re -import urllib.parse as ul -from typing import Any, Callable, Dict, List, Optional, Union - -import paddle - -from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer - -from ...loaders import LoraLoaderMixin -from ...models import UNet2DConditionModel -from ...schedulers import DDPMScheduler -from ...utils import ( - BACKENDS_MAPPING, - is_bs4_available, - is_ftfy_available, - logging, - randn_tensor, - replace_example_docstring, -) -from ..pipeline_utils import DiffusionPipeline -from . 
import IFPipelineOutput -from .safety_checker import IFSafetyChecker -from .watermark import IFWatermarker - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -if is_bs4_available(): - from bs4 import BeautifulSoup - -if is_ftfy_available(): - import ftfy - - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> from ppdiffusers import IFPipeline, IFSuperResolutionPipeline, DiffusionPipeline - >>> from ppdiffusers.utils import pd_to_pil - >>> import paddle - - >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16) - - >>> prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"' - >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) - - >>> image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pd").images - - >>> # save intermediate image - >>> pil_image = pd_to_pil(image) - >>> pil_image[0].save("./if_stage_I.png") - - >>> super_res_1_pipe = IFSuperResolutionPipeline.from_pretrained( - ... "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", paddle_dtype=paddle.float16 - ... ) - - >>> image = super_res_1_pipe( - ... image=image, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pd" - ... ).images - - >>> # save intermediate image - >>> pil_image = pd_to_pil(image) - >>> pil_image[0].save("./if_stage_I.png") - - >>> safety_modules = { - ... "feature_extractor": pipe.feature_extractor, - ... "safety_checker": pipe.safety_checker, - ... "watermarker": pipe.watermarker, - ... } - >>> super_res_2_pipe = DiffusionPipeline.from_pretrained( - ... "stabilityai/stable-diffusion-x4-upscaler", **safety_modules, paddle_dtype=paddle.float16 - ... ) - - >>> image = super_res_2_pipe( - ... prompt=prompt, - ... image=image, - ... ).images - >>> image[0].save("./if_stage_II.png") - ``` -""" - - -class IFPipeline(DiffusionPipeline, LoraLoaderMixin): - tokenizer: T5Tokenizer - text_encoder: T5EncoderModel - - unet: UNet2DConditionModel - scheduler: DDPMScheduler - - feature_extractor: Optional[CLIPImageProcessor] - safety_checker: Optional[IFSafetyChecker] - - watermarker: Optional[IFWatermarker] - - bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" - ) # noqa - - _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] - - def __init__( - self, - tokenizer: T5Tokenizer, - text_encoder: T5EncoderModel, - unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - safety_checker: Optional[IFSafetyChecker], - feature_extractor: Optional[CLIPImageProcessor], - watermarker: Optional[IFWatermarker], - requires_safety_checker: bool = True, - ): - super().__init__() - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the IF license and do not expose unfiltered" - " results in services or applications open to the public. Both the diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. 
For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - self.register_modules( - tokenizer=tokenizer, - text_encoder=text_encoder, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - watermarker=watermarker, - ) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - @paddle.no_grad() - def encode_prompt( - self, - prompt, - do_classifier_free_guidance=True, - num_images_per_prompt=1, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - clean_caption: bool = False, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`, *optional*, defaults to 1): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - """ - if prompt is not None and negative_prompt is not None: - if type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." 
- ) - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF - max_length = 77 - - if prompt_embeds is None: - prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=max_length, - truncation=True, - add_special_tokens=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {max_length} tokens: {removed_text}" - ) - - attention_mask = text_inputs.attention_mask - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - if self.text_encoder is not None: - dtype = self.text_encoder.dtype - elif self.unet is not None: - dtype = self.unet.dtype - else: - dtype = None - - if dtype is not None: - prompt_embeds = prompt_embeds.cast(dtype) - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_attention_mask=True, - add_special_tokens=True, - return_tensors="pd", - ) - attention_mask = uncond_input.attention_mask - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - if dtype is not None: - negative_prompt_embeds = negative_prompt_embeds.cast(dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. 
- # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - else: - negative_prompt_embeds = None - - return prompt_embeds, negative_prompt_embeds - - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, nsfw_detected, watermark_detected = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype), - ) - else: - nsfw_detected = None - watermark_detected = None - - return image, nsfw_detected, watermark_detected - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - def prepare_intermediate_images(self, batch_size, num_channels, height, width, dtype, generator): - shape = (batch_size, num_channels, height, width) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. 
Make sure the batch size matches the length of the generators." - ) - - intermediate_images = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - intermediate_images = intermediate_images * self.scheduler.init_noise_sigma - return intermediate_images - - def _text_preprocessing(self, text, clean_caption=False): - if clean_caption and not is_bs4_available(): - logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) - logger.warn("Setting `clean_caption` to False...") - clean_caption = False - - if clean_caption and not is_ftfy_available(): - logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) - logger.warn("Setting `clean_caption` to False...") - clean_caption = False - - if not isinstance(text, (tuple, list)): - text = [text] - - def process(text: str): - if clean_caption: - text = self._clean_caption(text) - text = self._clean_caption(text) - else: - text = text.lower().strip() - return text - - return [process(t) for t in text] - - def _clean_caption(self, caption): - caption = str(caption) - caption = ul.unquote_plus(caption) - caption = caption.strip().lower() - caption = re.sub("", "person", caption) - # urls: - caption = re.sub( - r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa - "", - caption, - ) # regex for urls - caption = re.sub( - r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa - "", - caption, - ) # regex for urls - # html: - caption = BeautifulSoup(caption, features="html.parser").text - - # @ - caption = re.sub(r"@[\w\d]+\b", "", caption) - - # 31C0—31EF CJK Strokes - # 31F0—31FF Katakana Phonetic Extensions - # 3200—32FF Enclosed CJK Letters and Months - # 3300—33FF CJK Compatibility - # 3400—4DBF CJK Unified Ideographs Extension A - # 4DC0—4DFF Yijing Hexagram Symbols - # 4E00—9FFF CJK Unified Ideographs - caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) - caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) - caption = re.sub(r"[\u3200-\u32ff]+", "", caption) - caption = re.sub(r"[\u3300-\u33ff]+", "", caption) - caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) - caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) - caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) - ####################################################### - - # все виды тире / all types of dash --> "-" - caption = re.sub( - r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa - "-", - caption, - ) - - # кавычки к одному стандарту - caption = re.sub(r"[`´«»“”¨]", '"', caption) - caption = re.sub(r"[‘’]", "'", caption) - - # " - caption = re.sub(r""?", "", caption) - # & - caption = re.sub(r"&", "", caption) - - # ip adresses: - caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) - - # article ids: - caption = re.sub(r"\d:\d\d\s+$", "", caption) - - # \n - caption = re.sub(r"\\n", " ", caption) - - # "#123" - caption = re.sub(r"#\d{1,3}\b", "", caption) - # "#12345.." - caption = re.sub(r"#\d{5,}\b", "", caption) - # "123456.." 
- caption = re.sub(r"\b\d{6,}\b", "", caption) - # filenames: - caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) - - # - caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" - caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" - - caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT - caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " - - # this-is-my-cute-cat / this_is_my_cute_cat - regex2 = re.compile(r"(?:\-|\_)") - if len(re.findall(regex2, caption)) > 3: - caption = re.sub(regex2, " ", caption) - - caption = ftfy.fix_text(caption) - caption = html.unescape(html.unescape(caption)) - - caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 - caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc - caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 - - caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) - caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) - caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) - caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) - caption = re.sub(r"\bpage\s+\d+\b", "", caption) - - caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... - - caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) - - caption = re.sub(r"\b\s+\:\s+", r": ", caption) - caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) - caption = re.sub(r"\s+", " ", caption) - - caption.strip() - - caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) - caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) - caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) - caption = re.sub(r"^\.\S+$", "", caption) - - return caption.strip() - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]] = None, - num_inference_steps: int = 100, - timesteps: List[int] = None, - guidance_scale: float = 7.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - height: Optional[int] = None, - width: Optional[int] = None, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: int = 1, - clean_caption: bool = True, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ): - """ - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` - timesteps are used. Must be in descending order. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. 
of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - height (`int`, *optional*, defaults to self.unet.config.sample_size): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size): - The width in pixels of the generated image. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of [paddle generator(s)] to make generation deterministic. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - clean_caption (`bool`, *optional*, defaults to `True`): - Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to - be installed. If the dependencies are not installed, the embeddings will be created from the raw - prompt. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - - Examples: - - Returns: - [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When - returning a tuple, the first element is a list with the generated images, and the second element is a list - of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) - or watermarked content, according to the `safety_checker`. 
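To make the `guidance_scale` description above concrete: in the denoising loop below, the unconditional and text-conditional noise estimates are combined as `eps = eps_uncond + w * (eps_text - eps_uncond)`, with the learned-variance channels carried through untouched. A minimal sketch of that step (the helper name and toy shapes are illustrative, not part of the pipeline):

```py
import paddle

# Sketch of the guidance step used in the denoising loop below. Assumes the UNet
# predicts 2*c channels: the first c are the noise estimate, the rest a learned
# variance that is passed through unchanged. `noise_pred` stacks the unconditional
# and text-conditional predictions along the batch axis.
def apply_classifier_free_guidance(noise_pred, c, guidance_scale):
    uncond, text = noise_pred.chunk(2)
    uncond_eps, _ = uncond.split([c, uncond.shape[1] - c], axis=1)
    text_eps, predicted_variance = text.split([c, text.shape[1] - c], axis=1)
    eps = uncond_eps + guidance_scale * (text_eps - uncond_eps)
    return paddle.concat([eps, predicted_variance], axis=1)

# Toy usage: one sample duplicated for uncond/cond, 3 noise + 3 variance channels.
toy_pred = paddle.randn([2, 6, 64, 64])
guided = apply_classifier_free_guidance(toy_pred, c=3, guidance_scale=7.0)
print(guided.shape)  # [1, 6, 64, 64]
```

With `guidance_scale = 1` the text term cancels out, which is why the pipeline only enables classifier-free guidance when the scale is greater than 1.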
- """ - # 1. Check inputs. Raise error if not correct - self.check_inputs(prompt, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) - - # 2. Define call parameters - height = height or self.unet.config.sample_size - width = width or self.unet.config.sample_size - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, - do_classifier_free_guidance, - num_images_per_prompt=num_images_per_prompt, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - clean_caption=clean_caption, - ) - - if do_classifier_free_guidance: - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - # 4. Prepare timesteps - if timesteps is not None: - self.scheduler.set_timesteps(timesteps=timesteps) - timesteps = self.scheduler.timesteps - num_inference_steps = len(timesteps) - else: - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare intermediate images - intermediate_images = self.prepare_intermediate_images( - batch_size * num_images_per_prompt, - self.unet.config.in_channels, - height, - width, - prompt_embeds.dtype, - generator, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - model_input = ( - paddle.concat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images - ) - model_input = self.scheduler.scale_model_input(model_input, t) - - # predict the noise residual - noise_pred = self.unet( - model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - return_dict=False, - )[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred_uncond, _ = noise_pred_uncond.split( - [model_input.shape[1], noise_pred_uncond.shape[1] - model_input.shape[1]], axis=1 - ) - noise_pred_text, predicted_variance = noise_pred_text.split( - [model_input.shape[1], noise_pred_text.shape[1] - model_input.shape[1]], axis=1 - ) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) - if self.scheduler.config.variance_type not in ["learned", "learned_range"]: - noise_pred, _ = noise_pred.split( - [model_input.shape[1], noise_pred_uncond.shape[1] - model_input.shape[1]], axis=1 - ) - - # compute the previous noisy sample x_t -> x_t-1 - intermediate_images = self.scheduler.step( - noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False - )[0] - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, intermediate_images) - - image = intermediate_images - - if output_type == "pil": - # 8. Post-processing - image = (image / 2 + 0.5).clip(0, 1) - image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() - - # 9. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) - - # 10. Convert to PIL - image = self.numpy_to_pil(image) - - # 11. Apply watermark - if self.watermarker is not None: - image = self.watermarker.apply_watermark(image, self.unet.config.sample_size) - elif output_type == "pd": - nsfw_detected = None - watermark_detected = None - - else: - # 8. Post-processing - image = (image / 2 + 0.5).clip(0, 1) - image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() - - # 9. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) - - if not return_dict: - return (image, nsfw_detected, watermark_detected) - - return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py deleted file mode 100644 index 841ffbf414ad..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +++ /dev/null @@ -1,865 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import html -import inspect -import re -import urllib.parse as ul -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import paddle -import PIL - -from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer - -from ...models import UNet2DConditionModel -from ...schedulers import DDPMScheduler -from ...utils import ( - BACKENDS_MAPPING, - PIL_INTERPOLATION, - is_bs4_available, - is_ftfy_available, - logging, - randn_tensor, - replace_example_docstring, -) -from ..pipeline_utils import DiffusionPipeline -from . import IFPipelineOutput -from .safety_checker import IFSafetyChecker -from .watermark import IFWatermarker - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -if is_bs4_available(): - from bs4 import BeautifulSoup - -if is_ftfy_available(): - import ftfy - - -def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: - w, h = images.size - - coef = w / h - - w, h = img_size, img_size - - if coef >= 1: - w = int(round(img_size / 8 * coef) * 8) - else: - h = int(round(img_size / 8 / coef) * 8) - - images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) - - return images - - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> from ppdiffusers import IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, DiffusionPipeline - >>> from ppdiffusers.utils import pd_to_pil - >>> import paddle - >>> from PIL import Image - >>> import requests - >>> from io import BytesIO - - >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" - >>> response = requests.get(url) - >>> original_image = Image.open(BytesIO(response.content)).convert("RGB") - >>> original_image = original_image.resize((768, 512)) - - >>> pipe = IFImg2ImgPipeline.from_pretrained( - ... "DeepFloyd/IF-I-XL-v1.0", - ... variant="fp16", - ... paddle_dtype=paddle.float16, - ... ) - - >>> prompt = "A fantasy landscape in style minecraft" - >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) - - >>> image = pipe( - ... image=original_image, - ... prompt_embeds=prompt_embeds, - ... negative_prompt_embeds=negative_embeds, - ... output_type="pd", - ... ).images - - >>> # save intermediate image - >>> pil_image = pd_to_pil(image) - >>> pil_image[0].save("./if_stage_I.png") - - >>> super_res_1_pipe = IFImg2ImgSuperResolutionPipeline.from_pretrained( - ... "DeepFloyd/IF-II-L-v1.0", - ... text_encoder=None, - ... variant="fp16", - ... paddle_dtype=paddle.float16, - ... ) - - >>> image = super_res_1_pipe( - ... image=image, - ... original_image=original_image, - ... prompt_embeds=prompt_embeds, - ... negative_prompt_embeds=negative_embeds, - ... 
).images - >>> image[0].save("./if_stage_II.png") - ``` -""" - - -class IFImg2ImgPipeline(DiffusionPipeline): - tokenizer: T5Tokenizer - text_encoder: T5EncoderModel - - unet: UNet2DConditionModel - scheduler: DDPMScheduler - - feature_extractor: Optional[CLIPImageProcessor] - safety_checker: Optional[IFSafetyChecker] - - watermarker: Optional[IFWatermarker] - - bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" - ) # noqa - - _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] - - def __init__( - self, - tokenizer: T5Tokenizer, - text_encoder: T5EncoderModel, - unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - safety_checker: Optional[IFSafetyChecker], - feature_extractor: Optional[CLIPImageProcessor], - watermarker: Optional[IFWatermarker], - requires_safety_checker: bool = True, - ): - super().__init__() - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the IF license and do not expose unfiltered" - " results in services or applications open to the public. Both the diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - self.register_modules( - tokenizer=tokenizer, - text_encoder=text_encoder, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - watermarker=watermarker, - ) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - @paddle.no_grad() - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt - def encode_prompt( - self, - prompt, - do_classifier_free_guidance=True, - num_images_per_prompt=1, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - clean_caption: bool = False, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`, *optional*, defaults to 1): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. 
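The `encode_prompt` method continued below duplicates each prompt embedding `num_images_per_prompt` times using a tile-then-reshape. A small sketch of just that duplication, with made-up shapes:

```py
import paddle

# Sketch of the duplication pattern used further down in encode_prompt: each
# prompt's embedding is repeated num_images_per_prompt times along the batch axis.
# The shapes here are illustrative, not taken from a real checkpoint.
batch, seq_len, dim = 2, 77, 4096
num_images_per_prompt = 3

prompt_embeds = paddle.randn([batch, seq_len, dim])
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])
prompt_embeds = prompt_embeds.reshape([batch * num_images_per_prompt, seq_len, dim])
print(prompt_embeds.shape)  # [6, 77, 4096]
```

The same pattern is applied to `negative_prompt_embeds` when classifier-free guidance is enabled, so both branches end up with matching batch sizes.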
- negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - """ - if prompt is not None and negative_prompt is not None: - if type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF - max_length = 77 - - if prompt_embeds is None: - prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=max_length, - truncation=True, - add_special_tokens=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {max_length} tokens: {removed_text}" - ) - attention_mask = text_inputs.attention_mask - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - if self.text_encoder is not None: - dtype = self.text_encoder.dtype - elif self.unet is not None: - dtype = self.unet.dtype - else: - dtype = None - if dtype is not None: - prompt_embeds = prompt_embeds.cast(dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_attention_mask=True, - add_special_tokens=True, - return_tensors="pd", - ) - attention_mask = uncond_input.attention_mask - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - if dtype is not None: - negative_prompt_embeds = negative_prompt_embeds.cast(dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - else: - negative_prompt_embeds = None - - return prompt_embeds, negative_prompt_embeds - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, nsfw_detected, watermark_detected = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype), - ) - else: - nsfw_detected = None - watermark_detected = None - - return image, nsfw_detected, watermark_detected - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - image, - batch_size, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
- ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - if isinstance(image, list): - check_image_type = image[0] - else: - check_image_type = image - - if ( - not isinstance(check_image_type, paddle.Tensor) - and not isinstance(check_image_type, PIL.Image.Image) - and not isinstance(check_image_type, np.ndarray) - ): - raise ValueError( - "`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" - f" {type(check_image_type)}" - ) - - if isinstance(image, list): - image_batch_size = len(image) - elif isinstance(image, paddle.Tensor): - image_batch_size = image.shape[0] - elif isinstance(image, PIL.Image.Image): - image_batch_size = 1 - elif isinstance(image, np.ndarray): - image_batch_size = image.shape[0] - else: - assert False - - if batch_size != image_batch_size: - raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing - def _text_preprocessing(self, text, clean_caption=False): - if clean_caption and not is_bs4_available(): - logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) - logger.warn("Setting `clean_caption` to False...") - clean_caption = False - - if clean_caption and not is_ftfy_available(): - logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) - logger.warn("Setting `clean_caption` to False...") - clean_caption = False - - if not isinstance(text, (tuple, list)): - text = [text] - - def process(text: str): - if clean_caption: - text = self._clean_caption(text) - text = self._clean_caption(text) - else: - text = text.lower().strip() - return text - - return [process(t) for t in text] - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption - def _clean_caption(self, caption): - caption = str(caption) - caption = ul.unquote_plus(caption) - caption = caption.strip().lower() - caption = re.sub("", "person", caption) - # urls: - caption = re.sub( - r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa - "", - caption, - ) # regex for urls - caption = re.sub( - r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa - "", - caption, - ) # regex for urls - # html: - caption = BeautifulSoup(caption, features="html.parser").text - - # @ - caption = re.sub(r"@[\w\d]+\b", "", caption) - - # 31C0—31EF CJK Strokes - # 31F0—31FF Katakana Phonetic Extensions - # 3200—32FF Enclosed CJK Letters and Months - # 3300—33FF CJK Compatibility - # 3400—4DBF CJK Unified Ideographs Extension A - # 4DC0—4DFF Yijing Hexagram Symbols - # 4E00—9FFF 
CJK Unified Ideographs - caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) - caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) - caption = re.sub(r"[\u3200-\u32ff]+", "", caption) - caption = re.sub(r"[\u3300-\u33ff]+", "", caption) - caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) - caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) - caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) - ####################################################### - - # все виды тире / all types of dash --> "-" - caption = re.sub( - r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa - "-", - caption, - ) - - # кавычки к одному стандарту - caption = re.sub(r"[`´«»“”¨]", '"', caption) - caption = re.sub(r"[‘’]", "'", caption) - - # " - caption = re.sub(r""?", "", caption) - # & - caption = re.sub(r"&", "", caption) - - # ip adresses: - caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) - - # article ids: - caption = re.sub(r"\d:\d\d\s+$", "", caption) - - # \n - caption = re.sub(r"\\n", " ", caption) - - # "#123" - caption = re.sub(r"#\d{1,3}\b", "", caption) - # "#12345.." - caption = re.sub(r"#\d{5,}\b", "", caption) - # "123456.." - caption = re.sub(r"\b\d{6,}\b", "", caption) - # filenames: - caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) - - # - caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" - caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" - - caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT - caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " - - # this-is-my-cute-cat / this_is_my_cute_cat - regex2 = re.compile(r"(?:\-|\_)") - if len(re.findall(regex2, caption)) > 3: - caption = re.sub(regex2, " ", caption) - - caption = ftfy.fix_text(caption) - caption = html.unescape(html.unescape(caption)) - - caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 - caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc - caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 - - caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) - caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) - caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) - caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) - caption = re.sub(r"\bpage\s+\d+\b", "", caption) - - caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... 
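A condensed, self-contained illustration of what a few of these cleaning rules do to a messy caption (URL, @-handle and hashtag-id removal plus whitespace collapse only; the sample string is made up):

```py
import re

# Not the full _clean_caption: just a handful of the rules above, applied in order
# to a fabricated caption so the effect is easy to see.
caption = "check https://example.com/shop @someshop #12345 cute   cat!!"
caption = caption.strip().lower()
caption = re.sub(
    r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",
    "",
    caption,
)  # drop URLs
caption = re.sub(r"@[\w\d]+\b", "", caption)   # drop @-handles
caption = re.sub(r"#\d{5,}\b", "", caption)    # drop long hashtag ids
caption = re.sub(r"\s+", " ", caption)         # collapse whitespace
print(caption.strip())  # "check cute cat!!"
```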
- - caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) - - caption = re.sub(r"\b\s+\:\s+", r": ", caption) - caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) - caption = re.sub(r"\s+", " ", caption) - - caption.strip() - - caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) - caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) - caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) - caption = re.sub(r"^\.\S+$", "", caption) - - return caption.strip() - - def preprocess_image(self, image: PIL.Image.Image) -> paddle.Tensor: - if not isinstance(image, list): - image = [image] - - def numpy_to_pd(images): - if images.ndim == 3: - images = images[..., None] - - images = paddle.to_tensor(images.transpose(0, 3, 1, 2)) - return images - - if isinstance(image[0], PIL.Image.Image): - new_image = [] - - for image_ in image: - image_ = image_.convert("RGB") - image_ = resize(image_, self.unet.sample_size) - image_ = np.array(image_) - image_ = image_.astype(np.float32) - image_ = image_ / 127.5 - 1 - new_image.append(image_) - - image = new_image - - image = np.stack(image, axis=0) # to np - image = numpy_to_pd(image) # to pd - - elif isinstance(image[0], np.ndarray): - image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) - image = numpy_to_pd(image) - - elif isinstance(image[0], paddle.Tensor): - image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0) - - return image - - def get_timesteps(self, num_inference_steps, strength): - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] - - return timesteps, num_inference_steps - t_start - - def prepare_intermediate_images(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): - _, channels, height, width = image.shape - - batch_size = batch_size * num_images_per_prompt - - shape = (batch_size, channels, height, width) - - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - noise = randn_tensor(shape, generator=generator, dtype=dtype) - - image = image.repeat_interleave(num_images_per_prompt, axis=0) - image = self.scheduler.add_noise(image, noise, timestep) - - return image - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: Union[ - PIL.Image.Image, paddle.Tensor, np.ndarray, List[PIL.Image.Image], List[paddle.Tensor], List[np.ndarray] - ] = None, - strength: float = 0.7, - num_inference_steps: int = 80, - timesteps: List[int] = None, - guidance_scale: float = 10.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: int = 1, - clean_caption: bool = True, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ): - """ - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. - strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` - will be used as a starting point, adding more noise to it the larger the `strength`. The number of - denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will - be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` - timesteps are used. Must be in descending order. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. 
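The `strength` argument documented above is what feeds `get_timesteps` earlier in this file: it keeps only the last `int(num_inference_steps * strength)` scheduler timesteps, so a lower strength starts denoising closer to the input image. A worked sketch with stand-in timestep values:

```py
# Worked sketch of the strength -> timestep truncation performed by get_timesteps.
# The scheduler timesteps are stand-ins; only the arithmetic matters here.
def get_timesteps(scheduler_timesteps, num_inference_steps, strength):
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)
    return scheduler_timesteps[t_start:], num_inference_steps - t_start

# 10 steps at strength 0.7 -> the 3 noisiest timesteps are skipped for img2img.
fake_timesteps = list(range(900, -1, -100))  # [900, 800, ..., 0]
kept, n = get_timesteps(fake_timesteps, num_inference_steps=10, strength=0.7)
print(n, kept)  # 7 [600, 500, 400, 300, 200, 100, 0]
```

At `strength = 1.0` nothing is skipped and the input image is effectively ignored, which matches the docstring above.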
- generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of [paddle generator(s)] to make generation deterministic. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - clean_caption (`bool`, *optional*, defaults to `True`): - Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to - be installed. If the dependencies are not installed, the embeddings will be created from the raw - prompt. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - - Examples: - - Returns: - [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When - returning a tuple, the first element is a list with the generated images, and the second element is a list - of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) - or watermarked content, according to the `safety_checker`. - """ - # 1. Check inputs. Raise error if not correct - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - self.check_inputs( - prompt, image, batch_size, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. 
Encode input prompt - prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, - do_classifier_free_guidance, - num_images_per_prompt=num_images_per_prompt, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - clean_caption=clean_caption, - ) - - if do_classifier_free_guidance: - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - dtype = prompt_embeds.dtype - - # 4. Prepare timesteps - if timesteps is not None: - self.scheduler.set_timesteps(timesteps=timesteps) - timesteps = self.scheduler.timesteps - num_inference_steps = len(timesteps) - else: - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - - # 5. Prepare intermediate images - image = self.preprocess_image(image) - image = image.cast(dtype) - - noise_timestep = timesteps[0:1] - noise_timestep = noise_timestep.tile((batch_size * num_images_per_prompt,)) - - intermediate_images = self.prepare_intermediate_images( - image, noise_timestep, batch_size, num_images_per_prompt, dtype, generator - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - model_input = ( - paddle.concat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images - ) - model_input = self.scheduler.scale_model_input(model_input, t) - - # predict the noise residual - noise_pred = self.unet( - model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred_uncond, _ = noise_pred_uncond.split( - [model_input.shape[1], noise_pred_uncond.shape[1] - model_input.shape[1]], axis=1 - ) - noise_pred_text, predicted_variance = noise_pred_text.split( - [model_input.shape[1], noise_pred_text.shape[1] - model_input.shape[1]], axis=1 - ) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) - - # compute the previous noisy sample x_t -> x_t-1 - intermediate_images = self.scheduler.step( - noise_pred, t, intermediate_images, **extra_step_kwargs - ).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, intermediate_images) - - image = intermediate_images - - if output_type == "pil": - # 8. Post-processing - image = (image / 2 + 0.5).clip(0, 1) - image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() - - # 9. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) - - # 10. Convert to PIL - image = self.numpy_to_pil(image) - - # 11. Apply watermark - if self.watermarker is not None: - self.watermarker.apply_watermark(image, self.unet.config.sample_size) - elif output_type == "pd": - nsfw_detected = None - watermark_detected = None - - else: - # 8. 
Post-processing - image = (image / 2 + 0.5).clip(0, 1) - image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() - - # 9. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) - - if not return_dict: - return (image, nsfw_detected, watermark_detected) - - return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py deleted file mode 100644 index c3d6e27aa110..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +++ /dev/null @@ -1,982 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import html -import inspect -import re -import urllib.parse as ul -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import paddle -import paddle.nn.functional as F -import PIL - -from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer - -from ...models import UNet2DConditionModel -from ...schedulers import DDPMScheduler -from ...utils import ( - BACKENDS_MAPPING, - PIL_INTERPOLATION, - is_bs4_available, - is_ftfy_available, - logging, - randn_tensor, - replace_example_docstring, -) -from ..pipeline_utils import DiffusionPipeline -from . import IFPipelineOutput -from .safety_checker import IFSafetyChecker -from .watermark import IFWatermarker - -if is_bs4_available(): - from bs4 import BeautifulSoup - -if is_ftfy_available(): - import ftfy - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.resize -def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: - w, h = images.size - - coef = w / h - - w, h = img_size, img_size - - if coef >= 1: - w = int(round(img_size / 8 * coef) * 8) - else: - h = int(round(img_size / 8 / coef) * 8) - - images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) - - return images - - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> from ppdiffusers import IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, DiffusionPipeline - >>> from ppdiffusers.utils import pd_to_pil - >>> import paddle - >>> from PIL import Image - >>> import requests - >>> from io import BytesIO - - >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" - >>> response = requests.get(url) - >>> original_image = Image.open(BytesIO(response.content)).convert("RGB") - >>> original_image = original_image.resize((768, 512)) - - >>> pipe = IFImg2ImgPipeline.from_pretrained( - ... "DeepFloyd/IF-I-XL-v1.0", - ... variant="fp16", - ... paddle_dtype=paddle.float16, - ... 
) - - >>> prompt = "A fantasy landscape in style minecraft" - >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) - - >>> image = pipe( - ... image=original_image, - ... prompt_embeds=prompt_embeds, - ... negative_prompt_embeds=negative_embeds, - ... output_type="pd", - ... ).images - - >>> # save intermediate image - >>> pil_image = pd_to_pil(image) - >>> pil_image[0].save("./if_stage_I.png") - - >>> super_res_1_pipe = IFImg2ImgSuperResolutionPipeline.from_pretrained( - ... "DeepFloyd/IF-II-L-v1.0", - ... text_encoder=None, - ... variant="fp16", - ... paddle_dtype=paddle.float16, - ... ) - - >>> image = super_res_1_pipe( - ... image=image, - ... original_image=original_image, - ... prompt_embeds=prompt_embeds, - ... negative_prompt_embeds=negative_embeds, - ... ).images - >>> image[0].save("./if_stage_II.png") - ``` -""" - - -class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline): - tokenizer: T5Tokenizer - text_encoder: T5EncoderModel - - unet: UNet2DConditionModel - scheduler: DDPMScheduler - image_noising_scheduler: DDPMScheduler - - feature_extractor: Optional[CLIPImageProcessor] - safety_checker: Optional[IFSafetyChecker] - - watermarker: Optional[IFWatermarker] - - bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" - ) # noqa - - _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor"] - - def __init__( - self, - tokenizer: T5Tokenizer, - text_encoder: T5EncoderModel, - unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - image_noising_scheduler: DDPMScheduler, - safety_checker: Optional[IFSafetyChecker], - feature_extractor: Optional[CLIPImageProcessor], - watermarker: Optional[IFWatermarker], - requires_safety_checker: bool = True, - ): - super().__init__() - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the IF license and do not expose unfiltered" - " results in services or applications open to the public. Both the diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - if unet.config.in_channels != 6: - logger.warn( - "It seems like you have loaded a checkpoint that shall not be used for super resolution from {unet.config._name_or_path} as it accepts {unet.config.in_channels} input channels instead of 6. Please make sure to pass a super resolution checkpoint as the `'unet'`: IFSuperResolutionPipeline.from_pretrained(unet=super_resolution_unet, ...)`." 
- ) - - self.register_modules( - tokenizer=tokenizer, - text_encoder=text_encoder, - unet=unet, - scheduler=scheduler, - image_noising_scheduler=image_noising_scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - watermarker=watermarker, - ) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing - def _text_preprocessing(self, text, clean_caption=False): - if clean_caption and not is_bs4_available(): - logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) - logger.warn("Setting `clean_caption` to False...") - clean_caption = False - - if clean_caption and not is_ftfy_available(): - logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) - logger.warn("Setting `clean_caption` to False...") - clean_caption = False - - if not isinstance(text, (tuple, list)): - text = [text] - - def process(text: str): - if clean_caption: - text = self._clean_caption(text) - text = self._clean_caption(text) - else: - text = text.lower().strip() - return text - - return [process(t) for t in text] - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption - def _clean_caption(self, caption): - caption = str(caption) - caption = ul.unquote_plus(caption) - caption = caption.strip().lower() - caption = re.sub("", "person", caption) - # urls: - caption = re.sub( - r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa - "", - caption, - ) # regex for urls - caption = re.sub( - r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa - "", - caption, - ) # regex for urls - # html: - caption = BeautifulSoup(caption, features="html.parser").text - - # @ - caption = re.sub(r"@[\w\d]+\b", "", caption) - - # 31C0—31EF CJK Strokes - # 31F0—31FF Katakana Phonetic Extensions - # 3200—32FF Enclosed CJK Letters and Months - # 3300—33FF CJK Compatibility - # 3400—4DBF CJK Unified Ideographs Extension A - # 4DC0—4DFF Yijing Hexagram Symbols - # 4E00—9FFF CJK Unified Ideographs - caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) - caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) - caption = re.sub(r"[\u3200-\u32ff]+", "", caption) - caption = re.sub(r"[\u3300-\u33ff]+", "", caption) - caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) - caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) - caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) - ####################################################### - - # все виды тире / all types of dash --> "-" - caption = re.sub( - r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa - "-", - caption, - ) - - # кавычки к одному стандарту - caption = re.sub(r"[`´«»“”¨]", '"', caption) - caption = re.sub(r"[‘’]", "'", caption) - - # " - caption = re.sub(r""?", "", caption) - # & - caption = re.sub(r"&", "", caption) - - # ip adresses: - caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) - - # article ids: - caption = re.sub(r"\d:\d\d\s+$", "", caption) - - # \n - caption = re.sub(r"\\n", " ", caption) - - # "#123" - caption = re.sub(r"#\d{1,3}\b", "", caption) - # "#12345.." - caption = re.sub(r"#\d{5,}\b", "", caption) - # "123456.." 
- caption = re.sub(r"\b\d{6,}\b", "", caption) - # filenames: - caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) - - # - caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" - caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" - - caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT - caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " - - # this-is-my-cute-cat / this_is_my_cute_cat - regex2 = re.compile(r"(?:\-|\_)") - if len(re.findall(regex2, caption)) > 3: - caption = re.sub(regex2, " ", caption) - - caption = ftfy.fix_text(caption) - caption = html.unescape(html.unescape(caption)) - - caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 - caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc - caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 - - caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) - caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) - caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) - caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) - caption = re.sub(r"\bpage\s+\d+\b", "", caption) - - caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... - - caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) - - caption = re.sub(r"\b\s+\:\s+", r": ", caption) - caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) - caption = re.sub(r"\s+", " ", caption) - - caption.strip() - - caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) - caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) - caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) - caption = re.sub(r"^\.\S+$", "", caption) - - return caption.strip() - - @paddle.no_grad() - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt - def encode_prompt( - self, - prompt, - do_classifier_free_guidance=True, - num_images_per_prompt=1, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - clean_caption: bool = False, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`, *optional*, defaults to 1): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. 
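As noted above, when guidance is enabled and no `negative_prompt_embeds` are passed, the unconditional branch is built from `negative_prompt`, defaulting to one empty string per prompt. A sketch of that resolution step (`resolve_uncond_tokens` is a hypothetical name used only here):

```py
# Sketch of how the unconditional tokens are chosen further down in encode_prompt.
def resolve_uncond_tokens(negative_prompt, batch_size):
    if negative_prompt is None:
        return [""] * batch_size           # empty prompt per sample
    if isinstance(negative_prompt, str):
        return [negative_prompt]           # broadcast a single string
    if len(negative_prompt) != batch_size:
        raise ValueError("`negative_prompt` must match the `prompt` batch size")
    return negative_prompt

print(resolve_uncond_tokens(None, 2))                   # ['', '']
print(resolve_uncond_tokens("blurry, low quality", 2))  # ['blurry, low quality']
```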
- """ - if prompt is not None and negative_prompt is not None: - if type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF - max_length = 77 - - if prompt_embeds is None: - prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=max_length, - truncation=True, - add_special_tokens=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {max_length} tokens: {removed_text}" - ) - attention_mask = text_inputs.attention_mask - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - if self.text_encoder is not None: - dtype = self.text_encoder.dtype - elif self.unet is not None: - dtype = self.unet.dtype - else: - dtype = None - - if dtype is not None: - prompt_embeds = prompt_embeds.cast(dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_attention_mask=True, - add_special_tokens=True, - return_tensors="pd", - ) - attention_mask = uncond_input.attention_mask - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - if dtype is not None: - negative_prompt_embeds = negative_prompt_embeds.cast(dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - else: - negative_prompt_embeds = None - - return prompt_embeds, negative_prompt_embeds - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, nsfw_detected, watermark_detected = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype), - ) - else: - nsfw_detected = None - watermark_detected = None - - return image, nsfw_detected, watermark_detected - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - image, - original_image, - batch_size, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. 
Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - # image - - if isinstance(image, list): - check_image_type = image[0] - else: - check_image_type = image - - if ( - not isinstance(check_image_type, paddle.Tensor) - and not isinstance(check_image_type, PIL.Image.Image) - and not isinstance(check_image_type, np.ndarray) - ): - raise ValueError( - "`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" - f" {type(check_image_type)}" - ) - - if isinstance(image, list): - image_batch_size = len(image) - elif isinstance(image, paddle.Tensor): - image_batch_size = image.shape[0] - elif isinstance(image, PIL.Image.Image): - image_batch_size = 1 - elif isinstance(image, np.ndarray): - image_batch_size = image.shape[0] - else: - assert False - - if batch_size != image_batch_size: - raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") - - # original_image - - if isinstance(original_image, list): - check_image_type = original_image[0] - else: - check_image_type = original_image - - if ( - not isinstance(check_image_type, paddle.Tensor) - and not isinstance(check_image_type, PIL.Image.Image) - and not isinstance(check_image_type, np.ndarray) - ): - raise ValueError( - "`original_image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" - f" {type(check_image_type)}" - ) - - if isinstance(original_image, list): - image_batch_size = len(original_image) - elif isinstance(original_image, paddle.Tensor): - image_batch_size = original_image.shape[0] - elif isinstance(original_image, PIL.Image.Image): - image_batch_size = 1 - elif isinstance(original_image, np.ndarray): - image_batch_size = original_image.shape[0] - else: - assert False - - if batch_size != image_batch_size: - raise ValueError( - f"original_image batch size: {image_batch_size} must be same as prompt batch size {batch_size}" - ) - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image with preprocess_image -> preprocess_original_image - def preprocess_original_image(self, image: PIL.Image.Image) -> paddle.Tensor: - if not isinstance(image, list): - image = [image] - - def numpy_to_pd(images): - if images.ndim == 3: - images = images[..., None] - - images = paddle.to_tensor(images.transpose(0, 3, 1, 2)) - return images - - if isinstance(image[0], PIL.Image.Image): - new_image = [] - - for image_ in image: - image_ = image_.convert("RGB") - image_ = resize(image_, self.unet.sample_size) - image_ = np.array(image_) - image_ = image_.astype(np.float32) - image_ = image_ / 127.5 - 1 - new_image.append(image_) - - image = new_image - - image = np.stack(image, axis=0) # to np - image = numpy_to_pd(image) # to pd - - elif isinstance(image[0], np.ndarray): - image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) - image = numpy_to_pd(image) - - elif isinstance(image[0], paddle.Tensor): - image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0) - - return image - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_superresolution.IFSuperResolutionPipeline.preprocess_image - def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt) -> paddle.Tensor: - if not isinstance(image, paddle.Tensor) and not isinstance(image, list): - image = [image] - - if isinstance(image[0], PIL.Image.Image): - image = [np.array(i).astype(np.float32) / 255.0 for i in image] - - image = np.stack(image, axis=0) # to np - image = paddle.to_tensor(image.transpose(0, 3, 1, 2)) - elif isinstance(image[0], np.ndarray): - image = np.stack(image, axis=0) # to np - if image.ndim == 5: - image = image[0] - - image = paddle.to_tensor(image.transpose(0, 3, 1, 2)) - elif isinstance(image, list) and isinstance(image[0], paddle.Tensor): - dims = image[0].ndim - - if dims == 3: - image = paddle.stack(image, axis=0) - elif dims == 4: - image = paddle.concat(image, axis=0) - else: - raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}") - - image = image.cast(self.unet.dtype) - - image = image.repeat_interleave(num_images_per_prompt, axis=0) - - return image - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps - def get_timesteps(self, num_inference_steps, strength): - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] - - return timesteps, num_inference_steps - t_start - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.prepare_intermediate_images - def prepare_intermediate_images(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): - _, 
channels, height, width = image.shape - - batch_size = batch_size * num_images_per_prompt - - shape = (batch_size, channels, height, width) - - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - noise = randn_tensor(shape, generator=generator, dtype=dtype) - - image = image.repeat_interleave(num_images_per_prompt, axis=0) - image = self.scheduler.add_noise(image, noise, timestep) - - return image - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor], - original_image: Union[ - PIL.Image.Image, paddle.Tensor, np.ndarray, List[PIL.Image.Image], List[paddle.Tensor], List[np.ndarray] - ] = None, - strength: float = 0.8, - prompt: Union[str, List[str]] = None, - num_inference_steps: int = 50, - timesteps: List[int] = None, - guidance_scale: float = 4.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - noise_level: int = 250, - clean_caption: bool = True, - ): - """ - Function invoked when calling the pipeline for generation. - - Args: - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. - original_image (`paddle.Tensor` or `PIL.Image.Image`): - The original image that `image` was varied from. - strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` - will be used as a starting point, adding more noise to it the larger the `strength`. The number of - denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will - be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` - timesteps are used. Must be in descending order. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of [paddle generator(s)] to make generation deterministic. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - noise_level (`int`, *optional*, defaults to 250): - The amount of noise to add to the upscaled image. Must be in the range `[0, 1000)` - clean_caption (`bool`, *optional*, defaults to `True`): - Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to - be installed. If the dependencies are not installed, the embeddings will be created from the raw - prompt. - - Examples: - - Returns: - [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When - returning a tuple, the first element is a list with the generated images, and the second element is a list - of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) - or watermarked content, according to the `safety_checker`. - """ - # 1. Check inputs. 
Raise error if not correct - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - self.check_inputs( - prompt, - image, - original_image, - batch_size, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - ) - - # 2. Define call parameters - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, - do_classifier_free_guidance, - num_images_per_prompt=num_images_per_prompt, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - clean_caption=clean_caption, - ) - - if do_classifier_free_guidance: - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - dtype = prompt_embeds.dtype - - # 4. Prepare timesteps - if timesteps is not None: - self.scheduler.set_timesteps(timesteps=timesteps) - timesteps = self.scheduler.timesteps - num_inference_steps = len(timesteps) - else: - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - - # 5. prepare original image - original_image = self.preprocess_original_image(original_image) - original_image = original_image.cast(dtype=dtype) - - # 6. Prepare intermediate images - noise_timestep = timesteps[0:1] - noise_timestep = noise_timestep.tile((batch_size * num_images_per_prompt,)) - - intermediate_images = self.prepare_intermediate_images( - original_image, - noise_timestep, - batch_size, - num_images_per_prompt, - dtype, - generator, - ) - - # 7. Prepare upscaled image and noise level - _, _, height, width = original_image.shape - - image = self.preprocess_image(image, num_images_per_prompt) - - upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True) - - noise_level = paddle.to_tensor([noise_level] * upscaled.shape[0]) - noise = randn_tensor(upscaled.shape, generator=generator, dtype=upscaled.dtype) - upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level) - - if do_classifier_free_guidance: - noise_level = paddle.concat([noise_level] * 2) - - # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 9. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - model_input = paddle.concat([intermediate_images, upscaled.cast(intermediate_images.dtype)], axis=1) - - model_input = paddle.concat([model_input] * 2) if do_classifier_free_guidance else model_input - model_input = self.scheduler.scale_model_input(model_input, t) - - # predict the noise residual - noise_pred = self.unet( - model_input, - t, - encoder_hidden_states=prompt_embeds, - class_labels=noise_level, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred_uncond, _ = noise_pred_uncond.split( - [model_input.shape[1] // 2, noise_pred_uncond.shape[1] - model_input.shape[1] // 2], axis=1 - ) - noise_pred_text, predicted_variance = noise_pred_text.split( - [model_input.shape[1] // 2, noise_pred_text.shape[1] - model_input.shape[1] // 2], axis=1 - ) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) - - # compute the previous noisy sample x_t -> x_t-1 - intermediate_images = self.scheduler.step( - noise_pred, t, intermediate_images, **extra_step_kwargs - ).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, intermediate_images) - - image = intermediate_images - - if output_type == "pil": - # 10. Post-processing - image = (image / 2 + 0.5).clip(0, 1) - image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() - - # 11. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) - - # 12. Convert to PIL - image = self.numpy_to_pil(image) - - # 13. Apply watermark - if self.watermarker is not None: - self.watermarker.apply_watermark(image, self.unet.config.sample_size) - elif output_type == "pd": - nsfw_detected = None - watermark_detected = None - else: - # 10. Post-processing - image = (image / 2 + 0.5).clip(0, 1) - image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() - - # 11. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) - - if not return_dict: - return (image, nsfw_detected, watermark_detected) - - return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py deleted file mode 100644 index f0137249aa09..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +++ /dev/null @@ -1,989 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import html -import inspect -import re -import urllib.parse as ul -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import paddle -import PIL - -from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer - -from ...models import UNet2DConditionModel -from ...schedulers import DDPMScheduler -from ...utils import ( - BACKENDS_MAPPING, - PIL_INTERPOLATION, - is_bs4_available, - is_ftfy_available, - logging, - randn_tensor, - replace_example_docstring, -) -from ..pipeline_utils import DiffusionPipeline -from . import IFPipelineOutput -from .safety_checker import IFSafetyChecker -from .watermark import IFWatermarker - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -if is_bs4_available(): - from bs4 import BeautifulSoup - -if is_ftfy_available(): - import ftfy - - -# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.resize -def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: - w, h = images.size - - coef = w / h - - w, h = img_size, img_size - - if coef >= 1: - w = int(round(img_size / 8 * coef) * 8) - else: - h = int(round(img_size / 8 / coef) * 8) - - images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) - - return images - - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> from ppdiffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline, DiffusionPipeline - >>> from ppdiffusers.utils import pd_to_pil - >>> import paddle - >>> from PIL import Image - >>> import requests - >>> from io import BytesIO - - >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/person.png" - >>> response = requests.get(url) - >>> original_image = Image.open(BytesIO(response.content)).convert("RGB") - >>> original_image = original_image - - >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/glasses_mask.png" - >>> response = requests.get(url) - >>> mask_image = Image.open(BytesIO(response.content)) - >>> mask_image = mask_image - - >>> pipe = IFInpaintingPipeline.from_pretrained( - ... "DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16 - ... ) - - >>> prompt = "blue sunglasses" - >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) - - >>> image = pipe( - ... image=original_image, - ... mask_image=mask_image, - ... prompt_embeds=prompt_embeds, - ... negative_prompt_embeds=negative_embeds, - ... output_type="pd", - ... ).images - - >>> # save intermediate image - >>> pil_image = pd_to_pil(image) - >>> pil_image[0].save("./if_stage_I.png") - - >>> super_res_1_pipe = IFInpaintingSuperResolutionPipeline.from_pretrained( - ... "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", paddle_dtype=paddle.float16 - ... ) - - >>> image = super_res_1_pipe( - ... image=image, - ... mask_image=mask_image, - ... original_image=original_image, - ... prompt_embeds=prompt_embeds, - ... negative_prompt_embeds=negative_embeds, - ... 
).images - >>> image[0].save("./if_stage_II.png") - ``` -""" - - -class IFInpaintingPipeline(DiffusionPipeline): - tokenizer: T5Tokenizer - text_encoder: T5EncoderModel - - unet: UNet2DConditionModel - scheduler: DDPMScheduler - - feature_extractor: Optional[CLIPImageProcessor] - safety_checker: Optional[IFSafetyChecker] - - watermarker: Optional[IFWatermarker] - - bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" - ) # noqa - - _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] - - def __init__( - self, - tokenizer: T5Tokenizer, - text_encoder: T5EncoderModel, - unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - safety_checker: Optional[IFSafetyChecker], - feature_extractor: Optional[CLIPImageProcessor], - watermarker: Optional[IFWatermarker], - requires_safety_checker: bool = True, - ): - super().__init__() - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the IF license and do not expose unfiltered" - " results in services or applications open to the public. Both the diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - self.register_modules( - tokenizer=tokenizer, - text_encoder=text_encoder, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - watermarker=watermarker, - ) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - @paddle.no_grad() - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt - def encode_prompt( - self, - prompt, - do_classifier_free_guidance=True, - num_images_per_prompt=1, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - clean_caption: bool = False, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`, *optional*, defaults to 1): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. 
- negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - """ - if prompt is not None and negative_prompt is not None: - if type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF - max_length = 77 - - if prompt_embeds is None: - prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=max_length, - truncation=True, - add_special_tokens=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {max_length} tokens: {removed_text}" - ) - - attention_mask = text_inputs.attention_mask - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - if self.text_encoder is not None: - dtype = self.text_encoder.dtype - elif self.unet is not None: - dtype = self.unet.dtype - else: - dtype = None - - if dtype is not None: - prompt_embeds = prompt_embeds.cast(dtype) - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_attention_mask=True, - add_special_tokens=True, - return_tensors="pd", - ) - attention_mask = uncond_input.attention_mask - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - if dtype is not None: - negative_prompt_embeds = negative_prompt_embeds.cast(dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - else: - negative_prompt_embeds = None - - return prompt_embeds, negative_prompt_embeds - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, nsfw_detected, watermark_detected = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype), - ) - else: - nsfw_detected = None - watermark_detected = None - - return image, nsfw_detected, watermark_detected - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - image, - mask_image, - batch_size, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. 
Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - # image - - if isinstance(image, list): - check_image_type = image[0] - else: - check_image_type = image - - if ( - not isinstance(check_image_type, paddle.Tensor) - and not isinstance(check_image_type, PIL.Image.Image) - and not isinstance(check_image_type, np.ndarray) - ): - raise ValueError( - "`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" - f" {type(check_image_type)}" - ) - - if isinstance(image, list): - image_batch_size = len(image) - elif isinstance(image, paddle.Tensor): - image_batch_size = image.shape[0] - elif isinstance(image, PIL.Image.Image): - image_batch_size = 1 - elif isinstance(image, np.ndarray): - image_batch_size = image.shape[0] - else: - assert False - - if batch_size != image_batch_size: - raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") - - # mask_image - - if isinstance(mask_image, list): - check_image_type = mask_image[0] - else: - check_image_type = mask_image - - if ( - not isinstance(check_image_type, paddle.Tensor) - and not isinstance(check_image_type, PIL.Image.Image) - and not isinstance(check_image_type, np.ndarray) - ): - raise ValueError( - "`mask_image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" - f" {type(check_image_type)}" - ) - - if isinstance(mask_image, list): - image_batch_size = len(mask_image) - elif isinstance(mask_image, paddle.Tensor): - image_batch_size = mask_image.shape[0] - elif isinstance(mask_image, PIL.Image.Image): - image_batch_size = 1 - elif isinstance(mask_image, np.ndarray): - image_batch_size = mask_image.shape[0] - else: - assert False - - if image_batch_size != 1 and batch_size != image_batch_size: - raise ValueError( - f"mask_image batch size: {image_batch_size} must be `1` or the same as prompt batch size {batch_size}" - ) - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing - def _text_preprocessing(self, text, clean_caption=False): - if clean_caption and not is_bs4_available(): - logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) - logger.warn("Setting `clean_caption` to False...") - clean_caption = False - - if clean_caption and not is_ftfy_available(): - logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) - logger.warn("Setting `clean_caption` to False...") - clean_caption = False - - if not isinstance(text, (tuple, list)): - text = [text] - - def process(text: str): - if clean_caption: - text = self._clean_caption(text) - text = self._clean_caption(text) - else: - text = text.lower().strip() - return text - - return [process(t) for t in text] - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption - def _clean_caption(self, caption): - caption = str(caption) - caption = ul.unquote_plus(caption) - caption = caption.strip().lower() - caption = re.sub("", "person", caption) - # urls: - caption = re.sub( - r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa - "", - caption, - ) # regex for urls - caption = re.sub( - r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa - "", - caption, - ) # regex for urls - # html: - caption = BeautifulSoup(caption, features="html.parser").text - - # @ - caption = re.sub(r"@[\w\d]+\b", "", caption) - - # 31C0—31EF CJK Strokes - # 31F0—31FF Katakana Phonetic Extensions - # 3200—32FF Enclosed CJK Letters and Months - # 3300—33FF CJK Compatibility - # 3400—4DBF CJK Unified Ideographs Extension A - # 4DC0—4DFF Yijing Hexagram Symbols - # 4E00—9FFF CJK Unified Ideographs - caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) - caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) - caption = re.sub(r"[\u3200-\u32ff]+", "", caption) - caption = re.sub(r"[\u3300-\u33ff]+", "", caption) - caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) - caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) - caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) - ####################################################### - - # все виды тире / all types of dash --> "-" - caption = re.sub( - r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa - "-", - caption, - ) - - # кавычки к одному стандарту - caption = re.sub(r"[`´«»“”¨]", '"', caption) - caption = re.sub(r"[‘’]", "'", caption) - - # " - caption = re.sub(r""?", "", caption) - # & - caption = re.sub(r"&", "", caption) - - # ip adresses: - caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) - - # article ids: - caption = re.sub(r"\d:\d\d\s+$", "", caption) - - # \n - caption = re.sub(r"\\n", " ", caption) - - # "#123" - caption = 
re.sub(r"#\d{1,3}\b", "", caption) - # "#12345.." - caption = re.sub(r"#\d{5,}\b", "", caption) - # "123456.." - caption = re.sub(r"\b\d{6,}\b", "", caption) - # filenames: - caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) - - # - caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" - caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" - - caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT - caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " - - # this-is-my-cute-cat / this_is_my_cute_cat - regex2 = re.compile(r"(?:\-|\_)") - if len(re.findall(regex2, caption)) > 3: - caption = re.sub(regex2, " ", caption) - - caption = ftfy.fix_text(caption) - caption = html.unescape(html.unescape(caption)) - - caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 - caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc - caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 - - caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) - caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) - caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) - caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) - caption = re.sub(r"\bpage\s+\d+\b", "", caption) - - caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... - - caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) - - caption = re.sub(r"\b\s+\:\s+", r": ", caption) - caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) - caption = re.sub(r"\s+", " ", caption) - - caption.strip() - - caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) - caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) - caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) - caption = re.sub(r"^\.\S+$", "", caption) - - return caption.strip() - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image - def preprocess_image(self, image: PIL.Image.Image) -> paddle.Tensor: - if not isinstance(image, list): - image = [image] - - def numpy_to_pd(images): - if images.ndim == 3: - images = images[..., None] - - images = paddle.to_tensor(images.transpose(0, 3, 1, 2)) - return images - - if isinstance(image[0], PIL.Image.Image): - new_image = [] - - for image_ in image: - image_ = image_.convert("RGB") - image_ = resize(image_, self.unet.sample_size) - image_ = np.array(image_) - image_ = image_.astype(np.float32) - image_ = image_ / 127.5 - 1 - new_image.append(image_) - - image = new_image - - image = np.stack(image, axis=0) # to np - image = numpy_to_pd(image) # to pd - - elif isinstance(image[0], np.ndarray): - image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) - image = numpy_to_pd(image) - - elif isinstance(image[0], paddle.Tensor): - image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0) - - return image - - def preprocess_mask_image(self, mask_image) -> paddle.Tensor: - if not isinstance(mask_image, list): - mask_image = [mask_image] - - if isinstance(mask_image[0], paddle.Tensor): - mask_image = ( - paddle.concat(mask_image, axis=0) if mask_image[0].ndim == 4 else paddle.stack(mask_image, axis=0) - ) - - if mask_image.ndim == 2: - # Batch and add channel dim for single mask - mask_image = mask_image.unsqueeze(0).unsqueeze(0) - elif mask_image.ndim == 3 and mask_image.shape[0] == 1: - # Single mask, the 0'th 
dimension is considered to be - # the existing batch size of 1 - mask_image = mask_image.unsqueeze(0) - elif mask_image.ndim == 3 and mask_image.shape[0] != 1: - # Batch of mask, the 0'th dimension is considered to be - # the batching dimension - mask_image = mask_image.unsqueeze(1) - - mask_image[mask_image < 0.5] = 0 - mask_image[mask_image >= 0.5] = 1 - - elif isinstance(mask_image[0], PIL.Image.Image): - new_mask_image = [] - - for mask_image_ in mask_image: - mask_image_ = mask_image_.convert("L") - mask_image_ = resize(mask_image_, self.unet.sample_size) - mask_image_ = np.array(mask_image_) - mask_image_ = mask_image_[None, None, :] - new_mask_image.append(mask_image_) - - mask_image = new_mask_image - - mask_image = np.concatenate(mask_image, axis=0) - mask_image = mask_image.astype(np.float32) / 255.0 - mask_image[mask_image < 0.5] = 0 - mask_image[mask_image >= 0.5] = 1 - mask_image = paddle.to_tensor(mask_image) - - elif isinstance(mask_image[0], np.ndarray): - mask_image = np.concatenate([m[None, None, :] for m in mask_image], axis=0) - - mask_image[mask_image < 0.5] = 0 - mask_image[mask_image >= 0.5] = 1 - mask_image = paddle.to_tensor(mask_image) - - return mask_image - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps - def get_timesteps(self, num_inference_steps, strength): - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] - - return timesteps, num_inference_steps - t_start - - def prepare_intermediate_images( - self, image, timestep, batch_size, num_images_per_prompt, dtype, mask_image, generator=None - ): - image_batch_size, channels, height, width = image.shape - - batch_size = batch_size * num_images_per_prompt - - shape = (batch_size, channels, height, width) - - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - noise = randn_tensor(shape, generator=generator, dtype=dtype) - - image = image.repeat_interleave(num_images_per_prompt, axis=0) - noised_image = self.scheduler.add_noise(image, noise, timestep) - - image = (1 - mask_image) * image + mask_image * noised_image - - return image - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: Union[ - PIL.Image.Image, paddle.Tensor, np.ndarray, List[PIL.Image.Image], List[paddle.Tensor], List[np.ndarray] - ] = None, - mask_image: Union[ - PIL.Image.Image, paddle.Tensor, np.ndarray, List[PIL.Image.Image], List[paddle.Tensor], List[np.ndarray] - ] = None, - strength: float = 1.0, - num_inference_steps: int = 50, - timesteps: List[int] = None, - guidance_scale: float = 7.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: int = 1, - clean_caption: bool = True, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ): - """ - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. - mask_image (`PIL.Image.Image`): - `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be - repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted - to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) - instead of 3, so the expected shape would be `(B, H, W, 1)`. - strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` - will be used as a starting point, adding more noise to it the larger the `strength`. The number of - denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will - be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` - timesteps are used. Must be in descending order. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of [paddle generator(s)] to make generation deterministic. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - clean_caption (`bool`, *optional*, defaults to `True`): - Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to - be installed. If the dependencies are not installed, the embeddings will be created from the raw - prompt. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - - Examples: - - Returns: - [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When - returning a tuple, the first element is a list with the generated images, and the second element is a list - of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) - or watermarked content, according to the `safety_checker`. - """ - # 1. Check inputs. 
Raise error if not correct - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - self.check_inputs( - prompt, - image, - mask_image, - batch_size, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - ) - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, - do_classifier_free_guidance, - num_images_per_prompt=num_images_per_prompt, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - clean_caption=clean_caption, - ) - - if do_classifier_free_guidance: - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - dtype = prompt_embeds.dtype - - # 4. Prepare timesteps - if timesteps is not None: - self.scheduler.set_timesteps(timesteps=timesteps) - timesteps = self.scheduler.timesteps - num_inference_steps = len(timesteps) - else: - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - - # 5. Prepare intermediate images - image = self.preprocess_image(image) - image = image.cast(dtype) - - mask_image = self.preprocess_mask_image(mask_image) - mask_image = mask_image.cast(dtype) - - if mask_image.shape[0] == 1: - mask_image = mask_image.repeat_interleave(batch_size * num_images_per_prompt, axis=0) - else: - mask_image = mask_image.repeat_interleave(num_images_per_prompt, axis=0) - - noise_timestep = timesteps[0:1] - noise_timestep = noise_timestep.tile((batch_size * num_images_per_prompt,)) - - intermediate_images = self.prepare_intermediate_images( - image, noise_timestep, batch_size, num_images_per_prompt, dtype, mask_image, generator - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - model_input = ( - paddle.concat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images - ) - model_input = self.scheduler.scale_model_input(model_input, t) - - # predict the noise residual - noise_pred = self.unet( - model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred_uncond, _ = noise_pred_uncond.split( - [model_input.shape[1], noise_pred_uncond.shape[1] - model_input.shape[1]], axis=1 - ) - noise_pred_text, predicted_variance = noise_pred_text.split( - [model_input.shape[1], noise_pred_text.shape[1] - model_input.shape[1]], axis=1 - ) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) - - # compute the previous noisy sample x_t -> x_t-1 - prev_intermediate_images = intermediate_images - - intermediate_images = self.scheduler.step( - noise_pred, t, intermediate_images, **extra_step_kwargs - ).prev_sample - - intermediate_images = (1 - mask_image) * prev_intermediate_images + mask_image * intermediate_images - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, intermediate_images) - - image = intermediate_images - - if output_type == "pil": - # 8. Post-processing - image = (image / 2 + 0.5).clip(0, 1) - image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() - - # 9. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) - - # 10. Convert to PIL - image = self.numpy_to_pil(image) - - # 11. Apply watermark - if self.watermarker is not None: - self.watermarker.apply_watermark(image, self.unet.config.sample_size) - elif output_type == "pd": - nsfw_detected = None - watermark_detected = None - - else: - # 8. Post-processing - image = (image / 2 + 0.5).clip(0, 1) - image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() - - # 9. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) - - if not return_dict: - return (image, nsfw_detected, watermark_detected) - - return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py deleted file mode 100644 index b4fcfe245331..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +++ /dev/null @@ -1,1098 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import html -import inspect -import re -import urllib.parse as ul -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import paddle -import paddle.nn.functional as F -import PIL - -from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer - -from ...models import UNet2DConditionModel -from ...schedulers import DDPMScheduler -from ...utils import ( - BACKENDS_MAPPING, - PIL_INTERPOLATION, - is_bs4_available, - is_ftfy_available, - logging, - randn_tensor, - replace_example_docstring, -) -from ..pipeline_utils import DiffusionPipeline -from . import IFPipelineOutput -from .safety_checker import IFSafetyChecker -from .watermark import IFWatermarker - -if is_bs4_available(): - from bs4 import BeautifulSoup - -if is_ftfy_available(): - import ftfy - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -# Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.resize -def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: - w, h = images.size - - coef = w / h - - w, h = img_size, img_size - - if coef >= 1: - w = int(round(img_size / 8 * coef) * 8) - else: - h = int(round(img_size / 8 / coef) * 8) - - images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) - - return images - - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> from ppdiffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline, DiffusionPipeline - >>> from ppdiffusers.utils import pd_to_pil - >>> import paddle - >>> from PIL import Image - >>> import requests - >>> from io import BytesIO - - >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/person.png" - >>> response = requests.get(url) - >>> original_image = Image.open(BytesIO(response.content)).convert("RGB") - >>> original_image = original_image - - >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/glasses_mask.png" - >>> response = requests.get(url) - >>> mask_image = Image.open(BytesIO(response.content)) - >>> mask_image = mask_image - - >>> pipe = IFInpaintingPipeline.from_pretrained( - ... "DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16 - ... ) - - >>> prompt = "blue sunglasses" - - >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) - >>> image = pipe( - ... image=original_image, - ... mask_image=mask_image, - ... prompt_embeds=prompt_embeds, - ... negative_prompt_embeds=negative_embeds, - ... output_type="pd", - ... ).images - - >>> # save intermediate image - >>> pil_image = pd_to_pil(image) - >>> pil_image[0].save("./if_stage_I.png") - - >>> super_res_1_pipe = IFInpaintingSuperResolutionPipeline.from_pretrained( - ... "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", paddle_dtype=paddle.float16 - ... ) - - >>> image = super_res_1_pipe( - ... image=image, - ... mask_image=mask_image, - ... original_image=original_image, - ... prompt_embeds=prompt_embeds, - ... negative_prompt_embeds=negative_embeds, - ... 
).images - >>> image[0].save("./if_stage_II.png") - ``` - """ - - -class IFInpaintingSuperResolutionPipeline(DiffusionPipeline): - tokenizer: T5Tokenizer - text_encoder: T5EncoderModel - - unet: UNet2DConditionModel - scheduler: DDPMScheduler - image_noising_scheduler: DDPMScheduler - - feature_extractor: Optional[CLIPImageProcessor] - safety_checker: Optional[IFSafetyChecker] - - watermarker: Optional[IFWatermarker] - - bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" - ) # noqa - - _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] - - def __init__( - self, - tokenizer: T5Tokenizer, - text_encoder: T5EncoderModel, - unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - image_noising_scheduler: DDPMScheduler, - safety_checker: Optional[IFSafetyChecker], - feature_extractor: Optional[CLIPImageProcessor], - watermarker: Optional[IFWatermarker], - requires_safety_checker: bool = True, - ): - super().__init__() - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the IF license and do not expose unfiltered" - " results in services or applications open to the public. Both the diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - if unet.config.in_channels != 6: - logger.warn( - "It seems like you have loaded a checkpoint that shall not be used for super resolution from {unet.config._name_or_path} as it accepts {unet.config.in_channels} input channels instead of 6. Please make sure to pass a super resolution checkpoint as the `'unet'`: IFSuperResolutionPipeline.from_pretrained(unet=super_resolution_unet, ...)`." 
- ) - - self.register_modules( - tokenizer=tokenizer, - text_encoder=text_encoder, - unet=unet, - scheduler=scheduler, - image_noising_scheduler=image_noising_scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - watermarker=watermarker, - ) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing - def _text_preprocessing(self, text, clean_caption=False): - if clean_caption and not is_bs4_available(): - logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) - logger.warn("Setting `clean_caption` to False...") - clean_caption = False - - if clean_caption and not is_ftfy_available(): - logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) - logger.warn("Setting `clean_caption` to False...") - clean_caption = False - - if not isinstance(text, (tuple, list)): - text = [text] - - def process(text: str): - if clean_caption: - text = self._clean_caption(text) - text = self._clean_caption(text) - else: - text = text.lower().strip() - return text - - return [process(t) for t in text] - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption - def _clean_caption(self, caption): - caption = str(caption) - caption = ul.unquote_plus(caption) - caption = caption.strip().lower() - caption = re.sub("", "person", caption) - # urls: - caption = re.sub( - r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa - "", - caption, - ) # regex for urls - caption = re.sub( - r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa - "", - caption, - ) # regex for urls - # html: - caption = BeautifulSoup(caption, features="html.parser").text - - # @ - caption = re.sub(r"@[\w\d]+\b", "", caption) - - # 31C0—31EF CJK Strokes - # 31F0—31FF Katakana Phonetic Extensions - # 3200—32FF Enclosed CJK Letters and Months - # 3300—33FF CJK Compatibility - # 3400—4DBF CJK Unified Ideographs Extension A - # 4DC0—4DFF Yijing Hexagram Symbols - # 4E00—9FFF CJK Unified Ideographs - caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) - caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) - caption = re.sub(r"[\u3200-\u32ff]+", "", caption) - caption = re.sub(r"[\u3300-\u33ff]+", "", caption) - caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) - caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) - caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) - ####################################################### - - # все виды тире / all types of dash --> "-" - caption = re.sub( - r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa - "-", - caption, - ) - - # кавычки к одному стандарту - caption = re.sub(r"[`´«»“”¨]", '"', caption) - caption = re.sub(r"[‘’]", "'", caption) - - # " - caption = re.sub(r""?", "", caption) - # & - caption = re.sub(r"&", "", caption) - - # ip adresses: - caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) - - # article ids: - caption = re.sub(r"\d:\d\d\s+$", "", caption) - - # \n - caption = re.sub(r"\\n", " ", caption) - - # "#123" - caption = re.sub(r"#\d{1,3}\b", "", caption) - # "#12345.." - caption = re.sub(r"#\d{5,}\b", "", caption) - # "123456.." 
- caption = re.sub(r"\b\d{6,}\b", "", caption) - # filenames: - caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) - - # - caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" - caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" - - caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT - caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " - - # this-is-my-cute-cat / this_is_my_cute_cat - regex2 = re.compile(r"(?:\-|\_)") - if len(re.findall(regex2, caption)) > 3: - caption = re.sub(regex2, " ", caption) - - caption = ftfy.fix_text(caption) - caption = html.unescape(html.unescape(caption)) - - caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 - caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc - caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 - - caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) - caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) - caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) - caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) - caption = re.sub(r"\bpage\s+\d+\b", "", caption) - - caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... - - caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) - - caption = re.sub(r"\b\s+\:\s+", r": ", caption) - caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) - caption = re.sub(r"\s+", " ", caption) - - caption.strip() - - caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) - caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) - caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) - caption = re.sub(r"^\.\S+$", "", caption) - - return caption.strip() - - @paddle.no_grad() - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt - def encode_prompt( - self, - prompt, - do_classifier_free_guidance=True, - num_images_per_prompt=1, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - clean_caption: bool = False, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`, *optional*, defaults to 1): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. 
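When `beautifulsoup4` or `ftfy` is missing, `_text_preprocessing` above silently falls back from the full caption cleaning to a plain lowercase-and-strip. A tiny sketch of that fallback path (the heavy regex pipeline in `_clean_caption` is deliberately not reproduced here):

```py
def fallback_preprocess(text):
    # what _text_preprocessing does per item once clean_caption has been forced to False
    items = text if isinstance(text, (tuple, list)) else [text]
    return [t.lower().strip() for t in items]

print(fallback_preprocess("  Blue Sunglasses  "))  # ['blue sunglasses']
```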
- """ - if prompt is not None and negative_prompt is not None: - if type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF - max_length = 77 - - if prompt_embeds is None: - prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=max_length, - truncation=True, - add_special_tokens=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {max_length} tokens: {removed_text}" - ) - attention_mask = text_inputs.attention_mask - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - if self.text_encoder is not None: - dtype = self.text_encoder.dtype - elif self.unet is not None: - dtype = self.unet.dtype - else: - dtype = None - - if dtype is not None: - prompt_embeds = prompt_embeds.cast(dtype) - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
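`encode_prompt` duplicates the text embeddings once per requested image by tiling along the sequence axis and then folding the copies back into the batch axis. The same shape manipulation in NumPy (the 4096-wide dimension is a toy value, not a claim about the T5 checkpoint):

```py
import numpy as np

num_images_per_prompt = 3
prompt_embeds = np.random.randn(2, 77, 4096).astype(np.float32)  # [batch, seq_len, dim]

bs_embed, seq_len, _ = prompt_embeds.shape
# tile along the sequence axis, then fold the copies back into the batch axis
dup = np.tile(prompt_embeds, (1, num_images_per_prompt, 1))
dup = dup.reshape(bs_embed * num_images_per_prompt, seq_len, -1)
print(dup.shape)  # (6, 77, 4096): each prompt repeated num_images_per_prompt times
```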
- ) - else: - uncond_tokens = negative_prompt - - uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_attention_mask=True, - add_special_tokens=True, - return_tensors="pd", - ) - attention_mask = uncond_input.attention_mask - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - if dtype is not None: - negative_prompt_embeds = negative_prompt_embeds.cast(dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - else: - negative_prompt_embeds = None - - return prompt_embeds, negative_prompt_embeds - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, nsfw_detected, watermark_detected = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype), - ) - else: - nsfw_detected = None - watermark_detected = None - - return image, nsfw_detected, watermark_detected - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - image, - original_image, - mask_image, - batch_size, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. 
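`prepare_extra_step_kwargs` above forwards `eta` and `generator` only when the scheduler's `step` signature actually accepts them, which lets one pipeline drive both DDIM-style and DDPM-style schedulers. The introspection pattern on its own, with toy scheduler classes standing in for the real ones:

```py
import inspect

class DDIMLikeScheduler:
    def step(self, model_output, timestep, sample, eta=0.0, generator=None):
        ...

class DDPMLikeScheduler:
    def step(self, model_output, timestep, sample, generator=None):
        ...

def extra_step_kwargs(scheduler, eta, generator):
    params = set(inspect.signature(scheduler.step).parameters)
    kwargs = {}
    if "eta" in params:
        kwargs["eta"] = eta            # eta is only meaningful for DDIM-style samplers
    if "generator" in params:
        kwargs["generator"] = generator
    return kwargs

print(extra_step_kwargs(DDIMLikeScheduler(), 0.0, None))  # {'eta': 0.0, 'generator': None}
print(extra_step_kwargs(DDPMLikeScheduler(), 0.0, None))  # {'generator': None}
```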
Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - # image - - if isinstance(image, list): - check_image_type = image[0] - else: - check_image_type = image - - if ( - not isinstance(check_image_type, paddle.Tensor) - and not isinstance(check_image_type, PIL.Image.Image) - and not isinstance(check_image_type, np.ndarray) - ): - raise ValueError( - "`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" - f" {type(check_image_type)}" - ) - - if isinstance(image, list): - image_batch_size = len(image) - elif isinstance(image, paddle.Tensor): - image_batch_size = image.shape[0] - elif isinstance(image, PIL.Image.Image): - image_batch_size = 1 - elif isinstance(image, np.ndarray): - image_batch_size = image.shape[0] - else: - assert False - - if batch_size != image_batch_size: - raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") - - # original_image - - if isinstance(original_image, list): - check_image_type = original_image[0] - else: - check_image_type = original_image - - if ( - not isinstance(check_image_type, paddle.Tensor) - and not isinstance(check_image_type, PIL.Image.Image) - and not isinstance(check_image_type, np.ndarray) - ): - raise ValueError( - "`original_image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" - f" {type(check_image_type)}" - ) - - if isinstance(original_image, list): - image_batch_size = len(original_image) - elif isinstance(original_image, paddle.Tensor): - image_batch_size = original_image.shape[0] - elif isinstance(original_image, PIL.Image.Image): - image_batch_size = 1 - elif isinstance(original_image, np.ndarray): - image_batch_size = original_image.shape[0] - else: - assert False - - if batch_size != image_batch_size: - raise ValueError( - f"original_image batch size: {image_batch_size} must be same as prompt batch size {batch_size}" - ) - - # mask_image - - if isinstance(mask_image, list): - check_image_type = mask_image[0] - else: - check_image_type = mask_image - - if ( - not isinstance(check_image_type, paddle.Tensor) - and not isinstance(check_image_type, PIL.Image.Image) - and not isinstance(check_image_type, np.ndarray) - ): - raise ValueError( - "`mask_image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" - f" {type(check_image_type)}" - ) - - if isinstance(mask_image, list): - image_batch_size = len(mask_image) - elif isinstance(mask_image, paddle.Tensor): - image_batch_size = mask_image.shape[0] - elif isinstance(mask_image, PIL.Image.Image): - image_batch_size = 1 - elif isinstance(mask_image, np.ndarray): - image_batch_size = mask_image.shape[0] - else: - assert False - - if image_batch_size != 1 and batch_size != image_batch_size: - raise ValueError( - f"mask_image batch size: {image_batch_size} must be `1` or the same as prompt batch size {batch_size}" - ) - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image with preprocess_image -> preprocess_original_image - def preprocess_original_image(self, image: PIL.Image.Image) -> paddle.Tensor: - if not isinstance(image, list): - image = [image] - - def numpy_to_pd(images): - if images.ndim == 3: - images = images[..., None] - - images = paddle.to_tensor(images.transpose(0, 3, 1, 2)) - return images - - if isinstance(image[0], PIL.Image.Image): - new_image = [] - - for image_ in image: - image_ = image_.convert("RGB") - image_ = resize(image_, self.unet.sample_size) - image_ = np.array(image_) - image_ = image_.astype(np.float32) - image_ = image_ / 127.5 - 1 - new_image.append(image_) - - image = new_image - - image = np.stack(image, axis=0) # to np - image = numpy_to_pd(image) # to pd - - elif isinstance(image[0], np.ndarray): - image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) - image = numpy_to_pd(image) - - elif isinstance(image[0], paddle.Tensor): - image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0) - - return image - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_superresolution.IFSuperResolutionPipeline.preprocess_image - def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt) -> paddle.Tensor: - if not isinstance(image, paddle.Tensor) and not isinstance(image, list): - image = [image] - - if isinstance(image[0], PIL.Image.Image): - image = [np.array(i).astype(np.float32) / 255.0 for i in image] - - image = np.stack(image, axis=0) # to np - image = paddle.to_tensor(image.transpose(0, 3, 1, 2)) - elif isinstance(image[0], np.ndarray): - image = np.stack(image, axis=0) # to np - if image.ndim == 5: - image = image[0] - - image = paddle.to_tensor(image.transpose(0, 3, 1, 2)) - elif isinstance(image, list) and isinstance(image[0], paddle.Tensor): - dims = image[0].ndim - - if dims == 3: - image = paddle.stack(image, axis=0) - elif dims == 4: - image = paddle.concat(image, axis=0) - else: - raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}") - - image = image.cast(self.unet.dtype) - - image = image.repeat_interleave(num_images_per_prompt, axis=0) - - return image - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_inpainting.IFInpaintingPipeline.preprocess_mask_image - def preprocess_mask_image(self, mask_image) -> paddle.Tensor: - if not isinstance(mask_image, list): - mask_image = [mask_image] - - if isinstance(mask_image[0], paddle.Tensor): - mask_image = ( - paddle.concat(mask_image, axis=0) if mask_image[0].ndim == 4 else paddle.stack(mask_image, axis=0) - ) - - if mask_image.ndim == 2: - # Batch and add channel dim for single mask - mask_image = mask_image.unsqueeze(0).unsqueeze(0) - elif mask_image.ndim == 3 and mask_image.shape[0] == 1: - # Single mask, the 0'th dimension is considered to be - # the existing batch size of 
1 - mask_image = mask_image.unsqueeze(0) - elif mask_image.ndim == 3 and mask_image.shape[0] != 1: - # Batch of mask, the 0'th dimension is considered to be - # the batching dimension - mask_image = mask_image.unsqueeze(1) - - mask_image[mask_image < 0.5] = 0 - mask_image[mask_image >= 0.5] = 1 - - elif isinstance(mask_image[0], PIL.Image.Image): - new_mask_image = [] - - for mask_image_ in mask_image: - mask_image_ = mask_image_.convert("L") - mask_image_ = resize(mask_image_, self.unet.sample_size) - mask_image_ = np.array(mask_image_) - mask_image_ = mask_image_[None, None, :] - new_mask_image.append(mask_image_) - - mask_image = new_mask_image - - mask_image = np.concatenate(mask_image, axis=0) - mask_image = mask_image.astype(np.float32) / 255.0 - mask_image[mask_image < 0.5] = 0 - mask_image[mask_image >= 0.5] = 1 - mask_image = paddle.to_tensor(mask_image) - - elif isinstance(mask_image[0], np.ndarray): - mask_image = np.concatenate([m[None, None, :] for m in mask_image], axis=0) - - mask_image[mask_image < 0.5] = 0 - mask_image[mask_image >= 0.5] = 1 - mask_image = paddle.to_tensor(mask_image) - - return mask_image - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps - def get_timesteps(self, num_inference_steps, strength): - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] - - return timesteps, num_inference_steps - t_start - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_inpainting.IFInpaintingPipeline.prepare_intermediate_images - def prepare_intermediate_images( - self, image, timestep, batch_size, num_images_per_prompt, dtype, mask_image, generator=None - ): - image_batch_size, channels, height, width = image.shape - - batch_size = batch_size * num_images_per_prompt - - shape = (batch_size, channels, height, width) - - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
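`get_timesteps` above converts `strength` into a truncated schedule: with 100 inference steps and `strength=0.8`, the first 20 timesteps are skipped and only the last 80 denoising steps run, so lower strength keeps the result closer to the original image. A self-contained sketch of that arithmetic over a made-up 100-step schedule:

```py
def get_timesteps(all_timesteps, num_inference_steps, strength):
    # keep only the last `strength` fraction of the schedule
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)
    return all_timesteps[t_start:], num_inference_steps - t_start

schedule = list(range(990, -1, -10))          # stand-in descending 100-step schedule
kept, n_steps = get_timesteps(schedule, 100, 0.8)
print(n_steps, kept[0], kept[-1])             # 80 790 0
```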
- ) - - noise = randn_tensor(shape, generator=generator, dtype=dtype) - - image = image.repeat_interleave(num_images_per_prompt, axis=0) - noised_image = self.scheduler.add_noise(image, noise, timestep) - - image = (1 - mask_image) * image + mask_image * noised_image - - return image - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor], - original_image: Union[ - PIL.Image.Image, paddle.Tensor, np.ndarray, List[PIL.Image.Image], List[paddle.Tensor], List[np.ndarray] - ] = None, - mask_image: Union[ - PIL.Image.Image, paddle.Tensor, np.ndarray, List[PIL.Image.Image], List[paddle.Tensor], List[np.ndarray] - ] = None, - strength: float = 0.8, - prompt: Union[str, List[str]] = None, - num_inference_steps: int = 100, - timesteps: List[int] = None, - guidance_scale: float = 4.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - noise_level: int = 0, - clean_caption: bool = True, - ): - """ - Function invoked when calling the pipeline for generation. - - Args: - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. - original_image (`paddle.Tensor` or `PIL.Image.Image`): - The original image that `image` was varied from. - mask_image (`PIL.Image.Image`): - `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be - repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted - to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) - instead of 3, so the expected shape would be `(B, H, W, 1)`. - strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` - will be used as a starting point, adding more noise to it the larger the `strength`. The number of - denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will - be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` - timesteps are used. Must be in descending order. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. 
of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of [paddle generator(s)] to make generation deterministic. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - noise_level (`int`, *optional*, defaults to 0): - The amount of noise to add to the upscaled image. Must be in the range `[0, 1000)` - clean_caption (`bool`, *optional*, defaults to `True`): - Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to - be installed. If the dependencies are not installed, the embeddings will be created from the raw - prompt. - - Examples: - - Returns: - [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When - returning a tuple, the first element is a list with the generated images, and the second element is a list - of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) - or watermarked content, according to the `safety_checker`. - """ - # 1. Check inputs. 
Raise error if not correct - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - self.check_inputs( - prompt, - image, - original_image, - mask_image, - batch_size, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - ) - - # 2. Define call parameters - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, - do_classifier_free_guidance, - num_images_per_prompt=num_images_per_prompt, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - clean_caption=clean_caption, - ) - - if do_classifier_free_guidance: - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - dtype = prompt_embeds.dtype - - # 4. Prepare timesteps - if timesteps is not None: - self.scheduler.set_timesteps(timesteps=timesteps) - timesteps = self.scheduler.timesteps - num_inference_steps = len(timesteps) - else: - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - - # 5. prepare original image - original_image = self.preprocess_original_image(original_image) - original_image = original_image.cast(dtype) - - # 6. prepare mask image - mask_image = self.preprocess_mask_image(mask_image) - mask_image = mask_image.cast(dtype) - - if mask_image.shape[0] == 1: - mask_image = mask_image.repeat_interleave(batch_size * num_images_per_prompt, axis=0) - else: - mask_image = mask_image.repeat_interleave(num_images_per_prompt, axis=0) - - # 6. Prepare intermediate images - noise_timestep = timesteps[0:1] - noise_timestep = noise_timestep.tile((batch_size * num_images_per_prompt,)) - - intermediate_images = self.prepare_intermediate_images( - original_image, - noise_timestep, - batch_size, - num_images_per_prompt, - dtype, - mask_image, - generator, - ) - - # 7. Prepare upscaled image and noise level - _, _, height, width = original_image.shape - - image = self.preprocess_image(image, num_images_per_prompt) - - upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True) - - noise_level = paddle.to_tensor([noise_level] * upscaled.shape[0]) - noise = randn_tensor(upscaled.shape, generator=generator, dtype=upscaled.dtype) - upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level) - - if do_classifier_free_guidance: - noise_level = paddle.concat([noise_level] * 2) - - # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 9. 
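Step 7 above is the noise conditioning augmentation used by cascaded diffusion models: the stage-I output is bilinearly upsampled to the target size, deliberately re-noised at `noise_level` with the `image_noising_scheduler`, and that same level is passed to the UNet as `class_labels`. A rough standalone sketch of the forward-noising it relies on, using a simplified DDPM q-sample and a toy beta schedule (not the scheduler's actual configuration):

```py
import numpy as np

def add_noise(x0, noise, t, alphas_cumprod):
    # DDPM forward process: x_t = sqrt(a_bar_t) * x0 + sqrt(1 - a_bar_t) * noise
    a_bar = alphas_cumprod[t]
    return np.sqrt(a_bar) * x0 + np.sqrt(1.0 - a_bar) * noise

alphas_cumprod = np.cumprod(1.0 - np.linspace(1e-4, 0.02, 1000))      # toy linear beta schedule
upscaled = np.random.uniform(-1, 1, (1, 3, 256, 256)).astype(np.float32)
noise = np.random.randn(*upscaled.shape).astype(np.float32)

slightly_noisy = add_noise(upscaled, noise, 0, alphas_cumprod)    # noise_level=0: nearly clean
heavier_noisy = add_noise(upscaled, noise, 250, alphas_cumprod)   # noise_level=250: noticeably corrupted
```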
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - model_input = paddle.concat([intermediate_images, upscaled.cast(intermediate_images.dtype)], axis=1) - - model_input = paddle.concat([model_input] * 2) if do_classifier_free_guidance else model_input - model_input = self.scheduler.scale_model_input(model_input, t) - - # predict the noise residual - noise_pred = self.unet( - model_input, - t, - encoder_hidden_states=prompt_embeds, - class_labels=noise_level, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred_uncond, _ = noise_pred_uncond.split( - [model_input.shape[1] // 2, noise_pred_uncond.shape[1] - model_input.shape[1] // 2], axis=1 - ) - noise_pred_text, predicted_variance = noise_pred_text.split( - [model_input.shape[1] // 2, noise_pred_text.shape[1] - model_input.shape[1] // 2], axis=1 - ) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) - - # compute the previous noisy sample x_t -> x_t-1 - prev_intermediate_images = intermediate_images - - intermediate_images = self.scheduler.step( - noise_pred, t, intermediate_images, **extra_step_kwargs - ).prev_sample - - intermediate_images = (1 - mask_image) * prev_intermediate_images + mask_image * intermediate_images - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, intermediate_images) - - image = intermediate_images - - if output_type == "pil": - # 10. Post-processing - image = (image / 2 + 0.5).clip(0, 1) - image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() - - # 11. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) - - # 12. Convert to PIL - image = self.numpy_to_pil(image) - - # 13. Apply watermark - if self.watermarker is not None: - self.watermarker.apply_watermark(image, self.unet.config.sample_size) - elif output_type == "pd": - nsfw_detected = None - watermark_detected = None - - else: - # 10. Post-processing - image = (image / 2 + 0.5).clip(0, 1) - image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() - - # 11. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) - - if not return_dict: - return (image, nsfw_detected, watermark_detected) - - return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py deleted file mode 100644 index da22cbe313bc..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +++ /dev/null @@ -1,843 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
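In the super-resolution loop above the UNet input has 6 channels (3 for the noisy target plus 3 for the noised upscaled conditioning image), and its output also has 6 channels, split into a noise prediction and a learned variance; that is why the guidance step splits on `model_input.shape[1] // 2` and re-attaches the variance afterwards. A NumPy sketch of that split-and-recombine (the shapes are illustrative):

```py
import numpy as np

guidance_scale = 4.0
in_channels = 6                                # noisy image (3) + noised upscaled image (3)
noise_pred = np.random.randn(2, 6, 64, 64)     # unconditional and text-conditional halves

noise_pred_uncond, noise_pred_text = noise_pred[0:1], noise_pred[1:2]
split = in_channels // 2                       # first 3 channels: noise, last 3: learned variance
eps_uncond = noise_pred_uncond[:, :split]
eps_text = noise_pred_text[:, :split]
predicted_variance = noise_pred_text[:, split:]

# classifier-free guidance acts on the noise prediction only; the variance is passed through
eps = eps_uncond + guidance_scale * (eps_text - eps_uncond)
guided = np.concatenate([eps, predicted_variance], axis=1)
print(guided.shape)  # (1, 6, 64, 64)
```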
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import html -import inspect -import re -import urllib.parse as ul -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import paddle -import paddle.nn.functional as F -import PIL - -from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer - -from ...models import UNet2DConditionModel -from ...schedulers import DDPMScheduler -from ...utils import ( - BACKENDS_MAPPING, - is_bs4_available, - is_ftfy_available, - logging, - randn_tensor, - replace_example_docstring, -) -from ..pipeline_utils import DiffusionPipeline -from . import IFPipelineOutput -from .safety_checker import IFSafetyChecker -from .watermark import IFWatermarker - -if is_bs4_available(): - from bs4 import BeautifulSoup - -if is_ftfy_available(): - import ftfy - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> from ppdiffusers import IFPipeline, IFSuperResolutionPipeline, DiffusionPipeline - >>> from ppdiffusers.utils import pd_to_pil - >>> import paddle - - >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16) - - >>> prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"' - >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) - - >>> image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pd").images - - >>> # save intermediate image - >>> pil_image = pd_to_pil(image) - >>> pil_image[0].save("./if_stage_I.png") - - >>> super_res_1_pipe = IFSuperResolutionPipeline.from_pretrained( - ... "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", paddle_dtype=paddle.float16 - ... ) - - >>> image = super_res_1_pipe( - ... image=image, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds - ... 
).images - >>> image[0].save("./if_stage_II.png") - ``` -""" - - -class IFSuperResolutionPipeline(DiffusionPipeline): - tokenizer: T5Tokenizer - text_encoder: T5EncoderModel - - unet: UNet2DConditionModel - scheduler: DDPMScheduler - image_noising_scheduler: DDPMScheduler - - feature_extractor: Optional[CLIPImageProcessor] - safety_checker: Optional[IFSafetyChecker] - - watermarker: Optional[IFWatermarker] - - bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" - ) # noqa - - _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] - - def __init__( - self, - tokenizer: T5Tokenizer, - text_encoder: T5EncoderModel, - unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - image_noising_scheduler: DDPMScheduler, - safety_checker: Optional[IFSafetyChecker], - feature_extractor: Optional[CLIPImageProcessor], - watermarker: Optional[IFWatermarker], - requires_safety_checker: bool = True, - ): - super().__init__() - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the IF license and do not expose unfiltered" - " results in services or applications open to the public. Both the diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - if unet.config.in_channels != 6: - logger.warn( - "It seems like you have loaded a checkpoint that shall not be used for super resolution from {unet.config._name_or_path} as it accepts {unet.config.in_channels} input channels instead of 6. Please make sure to pass a super resolution checkpoint as the `'unet'`: IFSuperResolutionPipeline.from_pretrained(unet=super_resolution_unet, ...)`." 
- ) - - self.register_modules( - tokenizer=tokenizer, - text_encoder=text_encoder, - unet=unet, - scheduler=scheduler, - image_noising_scheduler=image_noising_scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - watermarker=watermarker, - ) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing - def _text_preprocessing(self, text, clean_caption=False): - if clean_caption and not is_bs4_available(): - logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) - logger.warn("Setting `clean_caption` to False...") - clean_caption = False - - if clean_caption and not is_ftfy_available(): - logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) - logger.warn("Setting `clean_caption` to False...") - clean_caption = False - - if not isinstance(text, (tuple, list)): - text = [text] - - def process(text: str): - if clean_caption: - text = self._clean_caption(text) - text = self._clean_caption(text) - else: - text = text.lower().strip() - return text - - return [process(t) for t in text] - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption - def _clean_caption(self, caption): - caption = str(caption) - caption = ul.unquote_plus(caption) - caption = caption.strip().lower() - caption = re.sub("", "person", caption) - # urls: - caption = re.sub( - r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa - "", - caption, - ) # regex for urls - caption = re.sub( - r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa - "", - caption, - ) # regex for urls - # html: - caption = BeautifulSoup(caption, features="html.parser").text - - # @ - caption = re.sub(r"@[\w\d]+\b", "", caption) - - # 31C0—31EF CJK Strokes - # 31F0—31FF Katakana Phonetic Extensions - # 3200—32FF Enclosed CJK Letters and Months - # 3300—33FF CJK Compatibility - # 3400—4DBF CJK Unified Ideographs Extension A - # 4DC0—4DFF Yijing Hexagram Symbols - # 4E00—9FFF CJK Unified Ideographs - caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) - caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) - caption = re.sub(r"[\u3200-\u32ff]+", "", caption) - caption = re.sub(r"[\u3300-\u33ff]+", "", caption) - caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) - caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) - caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) - ####################################################### - - # все виды тире / all types of dash --> "-" - caption = re.sub( - r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa - "-", - caption, - ) - - # кавычки к одному стандарту - caption = re.sub(r"[`´«»“”¨]", '"', caption) - caption = re.sub(r"[‘’]", "'", caption) - - # " - caption = re.sub(r""?", "", caption) - # & - caption = re.sub(r"&", "", caption) - - # ip adresses: - caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) - - # article ids: - caption = re.sub(r"\d:\d\d\s+$", "", caption) - - # \n - caption = re.sub(r"\\n", " ", caption) - - # "#123" - caption = re.sub(r"#\d{1,3}\b", "", caption) - # "#12345.." - caption = re.sub(r"#\d{5,}\b", "", caption) - # "123456.." 
- caption = re.sub(r"\b\d{6,}\b", "", caption) - # filenames: - caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) - - # - caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" - caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" - - caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT - caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " - - # this-is-my-cute-cat / this_is_my_cute_cat - regex2 = re.compile(r"(?:\-|\_)") - if len(re.findall(regex2, caption)) > 3: - caption = re.sub(regex2, " ", caption) - - caption = ftfy.fix_text(caption) - caption = html.unescape(html.unescape(caption)) - - caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 - caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc - caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 - - caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) - caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) - caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) - caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) - caption = re.sub(r"\bpage\s+\d+\b", "", caption) - - caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... - - caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) - - caption = re.sub(r"\b\s+\:\s+", r": ", caption) - caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) - caption = re.sub(r"\s+", " ", caption) - - caption.strip() - - caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) - caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) - caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) - caption = re.sub(r"^\.\S+$", "", caption) - - return caption.strip() - - @paddle.no_grad() - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt - def encode_prompt( - self, - prompt, - do_classifier_free_guidance=True, - num_images_per_prompt=1, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - clean_caption: bool = False, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`, *optional*, defaults to 1): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. 
- """ - if prompt is not None and negative_prompt is not None: - if type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF - max_length = 77 - - if prompt_embeds is None: - prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=max_length, - truncation=True, - add_special_tokens=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {max_length} tokens: {removed_text}" - ) - - attention_mask = text_inputs.attention_mask - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - if self.text_encoder is not None: - dtype = self.text_encoder.dtype - elif self.unet is not None: - dtype = self.unet.dtype - else: - dtype = None - - if dtype is not None: - prompt_embeds = prompt_embeds.cast(dtype) - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_attention_mask=True, - add_special_tokens=True, - return_tensors="pd", - ) - attention_mask = uncond_input.attention_mask - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - if dtype is not None: - negative_prompt_embeds = negative_prompt_embeds.cast(dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - else: - negative_prompt_embeds = None - - return prompt_embeds, negative_prompt_embeds - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, nsfw_detected, watermark_detected = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype), - ) - else: - nsfw_detected = None - watermark_detected = None - - return image, nsfw_detected, watermark_detected - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - image, - batch_size, - noise_level, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. 
Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps: - raise ValueError( - f"`noise_level`: {noise_level} must be a valid timestep in `self.noising_scheduler`, [0, {self.image_noising_scheduler.config.num_train_timesteps})" - ) - - if isinstance(image, list): - check_image_type = image[0] - else: - check_image_type = image - - if ( - not isinstance(check_image_type, paddle.Tensor) - and not isinstance(check_image_type, PIL.Image.Image) - and not isinstance(check_image_type, np.ndarray) - ): - raise ValueError( - "`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" - f" {type(check_image_type)}" - ) - - if isinstance(image, list): - image_batch_size = len(image) - elif isinstance(image, paddle.Tensor): - image_batch_size = image.shape[0] - elif isinstance(image, PIL.Image.Image): - image_batch_size = 1 - elif isinstance(image, np.ndarray): - image_batch_size = image.shape[0] - else: - assert False - - if batch_size != image_batch_size: - raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") - - # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_intermediate_images - def prepare_intermediate_images(self, batch_size, num_channels, height, width, dtype, generator): - shape = (batch_size, num_channels, height, width) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - intermediate_images = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - intermediate_images = intermediate_images * self.scheduler.init_noise_sigma - return intermediate_images - - def preprocess_image(self, image, num_images_per_prompt): - if not isinstance(image, paddle.Tensor) and not isinstance(image, list): - image = [image] - - if isinstance(image[0], PIL.Image.Image): - image = [np.array(i).astype(np.float32) / 255.0 for i in image] - - image = np.stack(image, axis=0) # to np - image = paddle.to_tensor(image.transpose(0, 3, 1, 2)) - elif isinstance(image[0], np.ndarray): - image = np.stack(image, axis=0) # to np - if image.ndim == 5: - image = image[0] - - image = paddle.to_tensor(image.transpose(0, 3, 1, 2)) - elif isinstance(image, list) and isinstance(image[0], paddle.Tensor): - dims = image[0].ndim - - if dims == 3: - image = paddle.stack(image, axis=0) - elif dims == 4: - image = paddle.concat(image, axis=0) - else: - raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}") - - image = image.cast(self.unet.dtype) - - image = image.repeat_interleave(num_images_per_prompt, axis=0) - - return image - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]] = None, - height: int = None, - width: int = None, - image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor] = None, - num_inference_steps: int = 50, - timesteps: List[int] = None, - guidance_scale: float = 4.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - noise_level: int = 250, - clean_caption: bool = True, - ): - """ - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`int`, *optional*, defaults to self.unet.config.sample_size): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size): - The width in pixels of the generated image. - image (`PIL.Image.Image`, `np.ndarray`, `paddle.Tensor`): - The image to be upscaled. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` - timesteps are used. Must be in descending order. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of [paddle generator(s)] to make generation deterministic. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - noise_level (`int`, *optional*, defaults to 250): - The amount of noise to add to the upscaled image. Must be in the range `[0, 1000)` - clean_caption (`bool`, *optional*, defaults to `True`): - Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to - be installed. If the dependencies are not installed, the embeddings will be created from the raw - prompt. - - Examples: - - Returns: - [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When - returning a tuple, the first element is a list with the generated images, and the second element is a list - of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) - or watermarked content, according to the `safety_checker`. - """ - # 1. Check inputs. 
Raise error if not correct - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - self.check_inputs( - prompt, - image, - batch_size, - noise_level, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - ) - - # 2. Define call parameters - - height = height or self.unet.config.sample_size - width = width or self.unet.config.sample_size - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, - do_classifier_free_guidance, - num_images_per_prompt=num_images_per_prompt, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - clean_caption=clean_caption, - ) - - if do_classifier_free_guidance: - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - # 4. Prepare timesteps - if timesteps is not None: - self.scheduler.set_timesteps(timesteps=timesteps) - timesteps = self.scheduler.timesteps - num_inference_steps = len(timesteps) - else: - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare intermediate images - num_channels = self.unet.config.in_channels // 2 - intermediate_images = self.prepare_intermediate_images( - batch_size * num_images_per_prompt, - num_channels, - height, - width, - prompt_embeds.dtype, - generator, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Prepare upscaled image and noise level - image = self.preprocess_image(image, num_images_per_prompt) - upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True) - - noise_level = paddle.to_tensor([noise_level] * upscaled.shape[0]) - noise = randn_tensor(upscaled.shape, generator=generator, dtype=upscaled.dtype) - upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level) - - if do_classifier_free_guidance: - noise_level = paddle.concat([noise_level] * 2) - - # 8. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - model_input = paddle.concat([intermediate_images, upscaled.cast(intermediate_images.dtype)], axis=1) - - model_input = paddle.concat([model_input] * 2) if do_classifier_free_guidance else model_input - model_input = self.scheduler.scale_model_input(model_input, t) - - # predict the noise residual - noise_pred = self.unet( - model_input, - t, - encoder_hidden_states=prompt_embeds, - class_labels=noise_level, - cross_attention_kwargs=cross_attention_kwargs, - return_dict=False, - )[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred_uncond, _ = noise_pred_uncond.split( - [model_input.shape[1] // 2, noise_pred_uncond.shape[1] - model_input.shape[1] // 2], axis=1 - ) - noise_pred_text, predicted_variance = noise_pred_text.split( - [model_input.shape[1] // 2, noise_pred_text.shape[1] - model_input.shape[1] // 2], axis=1 - ) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) - - # compute the previous noisy sample x_t -> x_t-1 - intermediate_images = self.scheduler.step( - noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False - )[0] - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, intermediate_images) - - image = intermediate_images - - if output_type == "pil": - # 9. Post-processing - image = (image / 2 + 0.5).clip(0, 1) - image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() - - # 10. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) - - # 11. Convert to PIL - image = self.numpy_to_pil(image) - - # 12. Apply watermark - if self.watermarker is not None: - self.watermarker.apply_watermark(image, self.unet.config.sample_size) - elif output_type == "pd": - nsfw_detected = None - watermark_detected = None - - else: - # 9. Post-processing - image = (image / 2 + 0.5).clip(0, 1) - image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() - - # 10. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) - - if not return_dict: - return (image, nsfw_detected, watermark_detected) - - return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/safety_checker.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/safety_checker.py deleted file mode 100644 index 35de3106c75a..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/safety_checker.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import paddle -import paddle.nn as nn - -from paddlenlp.transformers import ( - CLIPConfig, - CLIPVisionModelWithProjection, - PretrainedModel, -) - -from ...utils import logging - -logger = logging.get_logger(__name__) - - -class IFSafetyChecker(PretrainedModel): - config_class = CLIPConfig - - def __init__(self, config: CLIPConfig): - super().__init__(config) - - self.vision_model = CLIPVisionModelWithProjection(config.vision_config) - - self.p_head = nn.Linear(config.vision_config.projection_dim, 1) - self.w_head = nn.Linear(config.vision_config.projection_dim, 1) - - @paddle.no_grad() - def forward(self, clip_input, images, p_threshold=0.5, w_threshold=0.5): - image_embeds = self.vision_model(clip_input)[0] - - nsfw_detected = self.p_head(image_embeds) - nsfw_detected = nsfw_detected.flatten() - nsfw_detected = nsfw_detected > p_threshold - nsfw_detected = nsfw_detected.tolist() - - if any(nsfw_detected): - logger.warning( - "Potential NSFW content was detected in one or more images. A black image will be returned instead." - " Try again with a different prompt and/or seed." - ) - - for idx, nsfw_detected_ in enumerate(nsfw_detected): - if nsfw_detected_: - images[idx] = np.zeros(images[idx].shape) - - watermark_detected = self.w_head(image_embeds) - watermark_detected = watermark_detected.flatten() - watermark_detected = watermark_detected > w_threshold - watermark_detected = watermark_detected.tolist() - - if any(watermark_detected): - logger.warning( - "Potential watermarked content was detected in one or more images. A black image will be returned instead." - " Try again with a different prompt and/or seed." - ) - - for idx, watermark_detected_ in enumerate(watermark_detected): - if watermark_detected_: - images[idx] = np.zeros(images[idx].shape) - - return images, nsfw_detected, watermark_detected diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/timesteps.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/timesteps.py deleted file mode 100644 index 041e468cd4cc..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/timesteps.py +++ /dev/null @@ -1,593 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
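For context, the IFSafetyChecker removed above scores CLIP image embeddings with two linear heads and blacks out any image whose NSFW or watermark score exceeds the threshold. A minimal sketch of that thresholding step, using random tensors and an untrained head purely as stand-ins for the real inputs:

import numpy as np
import paddle
import paddle.nn as nn

projection_dim = 768                                       # assumed CLIP projection size
image_embeds = paddle.randn([4, projection_dim])           # stand-in for vision_model(clip_input)[0]
images = [np.random.rand(64, 64, 3).astype("float32") for _ in range(4)]

p_head = nn.Linear(projection_dim, 1)                      # untrained stand-in for the NSFW head
flagged = (p_head(image_embeds).flatten() > 0.5).tolist()  # p_threshold defaults to 0.5
for idx, is_flagged in enumerate(flagged):
    if is_flagged:
        images[idx] = np.zeros(images[idx].shape)          # flagged outputs are replaced by black images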
- -fast27_timesteps = [ - 999, - 800, - 799, - 600, - 599, - 500, - 400, - 399, - 377, - 355, - 333, - 311, - 288, - 266, - 244, - 222, - 200, - 199, - 177, - 155, - 133, - 111, - 88, - 66, - 44, - 22, - 0, -] - -smart27_timesteps = [ - 999, - 976, - 952, - 928, - 905, - 882, - 858, - 857, - 810, - 762, - 715, - 714, - 572, - 429, - 428, - 286, - 285, - 238, - 190, - 143, - 142, - 118, - 95, - 71, - 47, - 24, - 0, -] - -smart50_timesteps = [ - 999, - 988, - 977, - 966, - 955, - 944, - 933, - 922, - 911, - 900, - 899, - 879, - 859, - 840, - 820, - 800, - 799, - 766, - 733, - 700, - 699, - 650, - 600, - 599, - 500, - 499, - 400, - 399, - 350, - 300, - 299, - 266, - 233, - 200, - 199, - 179, - 159, - 140, - 120, - 100, - 99, - 88, - 77, - 66, - 55, - 44, - 33, - 22, - 11, - 0, -] - -smart100_timesteps = [ - 999, - 995, - 992, - 989, - 985, - 981, - 978, - 975, - 971, - 967, - 964, - 961, - 957, - 956, - 951, - 947, - 942, - 937, - 933, - 928, - 923, - 919, - 914, - 913, - 908, - 903, - 897, - 892, - 887, - 881, - 876, - 871, - 870, - 864, - 858, - 852, - 846, - 840, - 834, - 828, - 827, - 820, - 813, - 806, - 799, - 792, - 785, - 784, - 777, - 770, - 763, - 756, - 749, - 742, - 741, - 733, - 724, - 716, - 707, - 699, - 698, - 688, - 677, - 666, - 656, - 655, - 645, - 634, - 623, - 613, - 612, - 598, - 584, - 570, - 569, - 555, - 541, - 527, - 526, - 505, - 484, - 483, - 462, - 440, - 439, - 396, - 395, - 352, - 351, - 308, - 307, - 264, - 263, - 220, - 219, - 176, - 132, - 88, - 44, - 0, -] - -smart185_timesteps = [ - 999, - 997, - 995, - 992, - 990, - 988, - 986, - 984, - 981, - 979, - 977, - 975, - 972, - 970, - 968, - 966, - 964, - 961, - 959, - 957, - 956, - 954, - 951, - 949, - 946, - 944, - 941, - 939, - 936, - 934, - 931, - 929, - 926, - 924, - 921, - 919, - 916, - 914, - 913, - 910, - 907, - 905, - 902, - 899, - 896, - 893, - 891, - 888, - 885, - 882, - 879, - 877, - 874, - 871, - 870, - 867, - 864, - 861, - 858, - 855, - 852, - 849, - 846, - 843, - 840, - 837, - 834, - 831, - 828, - 827, - 824, - 821, - 817, - 814, - 811, - 808, - 804, - 801, - 798, - 795, - 791, - 788, - 785, - 784, - 780, - 777, - 774, - 770, - 766, - 763, - 760, - 756, - 752, - 749, - 746, - 742, - 741, - 737, - 733, - 730, - 726, - 722, - 718, - 714, - 710, - 707, - 703, - 699, - 698, - 694, - 690, - 685, - 681, - 677, - 673, - 669, - 664, - 660, - 656, - 655, - 650, - 646, - 641, - 636, - 632, - 627, - 622, - 618, - 613, - 612, - 607, - 602, - 596, - 591, - 586, - 580, - 575, - 570, - 569, - 563, - 557, - 551, - 545, - 539, - 533, - 527, - 526, - 519, - 512, - 505, - 498, - 491, - 484, - 483, - 474, - 466, - 457, - 449, - 440, - 439, - 428, - 418, - 407, - 396, - 395, - 381, - 366, - 352, - 351, - 330, - 308, - 307, - 286, - 264, - 263, - 242, - 220, - 219, - 176, - 175, - 132, - 131, - 88, - 44, - 0, -] - -super27_timesteps = [ - 999, - 991, - 982, - 974, - 966, - 958, - 950, - 941, - 933, - 925, - 916, - 908, - 900, - 899, - 874, - 850, - 825, - 800, - 799, - 700, - 600, - 500, - 400, - 300, - 200, - 100, - 0, -] - -super40_timesteps = [ - 999, - 992, - 985, - 978, - 971, - 964, - 957, - 949, - 942, - 935, - 928, - 921, - 914, - 907, - 900, - 899, - 879, - 859, - 840, - 820, - 800, - 799, - 766, - 733, - 700, - 699, - 650, - 600, - 599, - 500, - 499, - 400, - 399, - 300, - 299, - 200, - 199, - 100, - 99, - 0, -] - -super100_timesteps = [ - 999, - 996, - 992, - 989, - 985, - 982, - 979, - 975, - 972, - 968, - 965, - 961, - 958, - 955, - 951, - 948, - 944, - 941, - 938, - 934, - 931, - 927, - 924, - 920, - 917, 
- 914, - 910, - 907, - 903, - 900, - 899, - 891, - 884, - 876, - 869, - 861, - 853, - 846, - 838, - 830, - 823, - 815, - 808, - 800, - 799, - 788, - 777, - 766, - 755, - 744, - 733, - 722, - 711, - 700, - 699, - 688, - 677, - 666, - 655, - 644, - 633, - 622, - 611, - 600, - 599, - 585, - 571, - 557, - 542, - 528, - 514, - 500, - 499, - 485, - 471, - 457, - 442, - 428, - 414, - 400, - 399, - 379, - 359, - 340, - 320, - 300, - 299, - 279, - 259, - 240, - 220, - 200, - 199, - 166, - 133, - 100, - 99, - 66, - 33, - 0, -] diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/watermark.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/watermark.py deleted file mode 100644 index de1cf051a99f..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/watermark.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List - -import paddle -import PIL -from PIL import Image - -from ...configuration_utils import ConfigMixin -from ...models.modeling_utils import ModelMixin -from ...utils import PIL_INTERPOLATION - - -class IFWatermarker(ModelMixin, ConfigMixin): - def __init__(self): - super().__init__() - - self.register_buffer("watermark_image", paddle.zeros((62, 62, 4), dtype=paddle.get_default_dtype())) - self.watermark_image_as_pil = None - - def apply_watermark(self, images: List[PIL.Image.Image], sample_size=None): - # copied from https://github.com/deep-floyd/IF/blob/b77482e36ca2031cb94dbca1001fc1e6400bf4ab/deepfloyd_if/modules/base.py#L287 - - h = images[0].height - w = images[0].width - - sample_size = sample_size or h - - coef = min(h / sample_size, w / sample_size) - img_h, img_w = (int(h / coef), int(w / coef)) if coef < 1 else (h, w) - - S1, S2 = 1024**2, img_w * img_h - K = (S2 / S1) ** 0.5 - wm_size, wm_x, wm_y = int(K * 62), img_w - int(14 * K), img_h - int(14 * K) - - if self.watermark_image_as_pil is None: - watermark_image = self.watermark_image.cpu().numpy().astype("uint8") - watermark_image = Image.fromarray(watermark_image, mode="RGBA") - self.watermark_image_as_pil = watermark_image - - wm_img = self.watermark_image_as_pil.resize( - (wm_size, wm_size), PIL_INTERPOLATION["bicubic"], reducing_gap=None - ) - - for pil_img in images: - pil_img.paste(wm_img, box=(wm_x - wm_size, wm_y - wm_size, wm_x, wm_y), mask=wm_img.split()[-1]) - - return images diff --git a/ppdiffusers/ppdiffusers/pipelines/dit/__init__.py b/ppdiffusers/ppdiffusers/pipelines/dit/__init__.py deleted file mode 100644 index 3ae256886b87..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/dit/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
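The apply_watermark method deleted above sizes the 62x62 watermark relative to a 1024x1024 reference area and pastes it near the bottom-right corner. A rough worked example of that sizing arithmetic, with a 512x512 output chosen only for illustration:

import math

h = w = sample_size = 512                              # illustrative output resolution
coef = min(h / sample_size, w / sample_size)           # 1.0 here, so the working size is unchanged
img_h, img_w = (h, w)
K = math.sqrt((img_w * img_h) / 1024**2)               # 0.5 for a 512x512 image
wm_size = int(K * 62)                                  # 31 px watermark
wm_x, wm_y = img_w - int(14 * K), img_h - int(14 * K)  # paste box ends 7 px in from the corner
print(wm_size, wm_x, wm_y)                             # 31 505 505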
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .pipeline_dit import DiTPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/dit/pipeline_dit.py b/ppdiffusers/ppdiffusers/pipelines/dit/pipeline_dit.py deleted file mode 100644 index 7672e27c7b4e..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/dit/pipeline_dit.py +++ /dev/null @@ -1,208 +0,0 @@ -# Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) -# William Peebles and Saining Xie -# -# Copyright (c) 2021 OpenAI -# MIT License -# -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Dict, List, Optional, Tuple, Union - -import paddle - -from ...models import AutoencoderKL, Transformer2DModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput - - -class DiTPipeline(DiffusionPipeline): - r""" - This pipeline inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Parameters: - transformer ([`Transformer2DModel`]): - Class conditioned Transformer in Diffusion model to denoise the encoded image latents. - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - scheduler ([`DDIMScheduler`]): - A scheduler to be used in combination with `dit` to denoise the encoded image latents. - """ - - def __init__( - self, - transformer: Transformer2DModel, - vae: AutoencoderKL, - scheduler: KarrasDiffusionSchedulers, - id2label: Optional[Dict[int, str]] = None, - ): - super().__init__() - self.register_modules(transformer=transformer, vae=vae, scheduler=scheduler) - - # create a imagenet -> id dictionary for easier use - self.labels = {} - if id2label is not None: - for key, value in id2label.items(): - for label in value.split(","): - self.labels[label.lstrip().rstrip()] = int(key) - self.labels = dict(sorted(self.labels.items())) - # register id2label - self.register_to_config(id2label=id2label) - - def get_label_ids(self, label: Union[str, List[str]]) -> List[int]: - r""" - - Map label strings, *e.g.* from ImageNet, to corresponding class ids. - - Parameters: - label (`str` or `dict` of `str`): label strings to be mapped to class ids. - - Returns: - `list` of `int`: Class ids to be processed by pipeline. 
- """ - - if not isinstance(label, list): - label = list(label) - - for l in label: - if l not in self.labels: - raise ValueError( - f"{l} does not exist. Please make sure to select one of the following labels: \n {self.labels}." - ) - - return [self.labels[l] for l in label] - - @paddle.no_grad() - def __call__( - self, - class_labels: List[int], - guidance_scale: float = 4.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - num_inference_steps: int = 50, - output_type: Optional[str] = "pil", - return_dict: bool = True, - ) -> Union[ImagePipelineOutput, Tuple]: - r""" - Function invoked when calling the pipeline for generation. - - Args: - class_labels (List[int]): - List of imagenet class labels for the images to be generated. - guidance_scale (`float`, *optional*, defaults to 4.0): - Scale of the guidance signal. - generator (`paddle.Generator`, *optional*): - A [paddle generator] to make generation deterministic. - num_inference_steps (`int`, *optional*, defaults to 250): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple. - """ - - batch_size = len(class_labels) - latent_size = self.transformer.config.sample_size - latent_channels = self.transformer.config.in_channels - - latents = randn_tensor( - shape=(batch_size, latent_channels, latent_size, latent_size), - generator=generator, - dtype=self.transformer.dtype, - ) - latent_model_input = paddle.concat([latents] * 2) if guidance_scale > 1 else latents - - class_labels = paddle.to_tensor(class_labels).flatten() - class_null = paddle.to_tensor([1000] * batch_size) - class_labels_input = paddle.concat([class_labels, class_null], 0) if guidance_scale > 1 else class_labels - - # set step values - self.scheduler.set_timesteps(num_inference_steps) - - for t in self.progress_bar(self.scheduler.timesteps): - if guidance_scale > 1: - half = latent_model_input[: len(latent_model_input) // 2] - latent_model_input = paddle.concat([half, half], axis=0) - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - timesteps = t - if not paddle.is_tensor(timesteps): - # TODO: this requires sync between CPU and GPU. 
So try to pass timesteps as tensors if you can - # This would be a good case for the `match` statement (Python 3.10+) - if isinstance(timesteps, float): - dtype = paddle.float32 - else: - dtype = paddle.int64 - timesteps = paddle.to_tensor([timesteps], dtype=dtype) - elif len(timesteps.shape) == 0: - timesteps = timesteps[None] - # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps.expand( - [ - latent_model_input.shape[0], - ] - ) - # predict noise model_output - noise_pred = self.transformer( - latent_model_input, timestep=timesteps, class_labels=class_labels_input - ).sample - - # perform guidance - if guidance_scale > 1: - eps, rest = noise_pred[:, :latent_channels], noise_pred[:, latent_channels:] - bs = eps.shape[0] - # TODO torch.split vs paddle.split - cond_eps, uncond_eps = paddle.split(eps, [bs // 2, bs - bs // 2], axis=0) - - half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps) - eps = paddle.concat([half_eps, half_eps], axis=0) - - noise_pred = paddle.concat([eps, rest], axis=1) - - # learned sigma - if self.transformer.config.out_channels // 2 == latent_channels: - # TODO torch.split vs paddle.split - model_output, _ = paddle.split( - noise_pred, [latent_channels, noise_pred.shape[1] - latent_channels], axis=1 - ) - else: - model_output = noise_pred - - # compute previous image: x_t -> x_t-1 - latent_model_input = self.scheduler.step(model_output, t, latent_model_input).prev_sample - - if guidance_scale > 1: - latents, _ = latent_model_input.chunk(2, axis=0) - else: - latents = latent_model_input - - latents = 1 / self.vae.config.scaling_factor * latents - samples = self.vae.decode(latents).sample - - samples = (samples / 2 + 0.5).clip(0, 1) - - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - samples = samples.transpose([0, 2, 3, 1]).cast("float32").numpy() - - if output_type == "pil": - samples = self.numpy_to_pil(samples) - - if not return_dict: - return (samples,) - - return ImagePipelineOutput(images=samples) diff --git a/ppdiffusers/ppdiffusers/pipelines/fastdeploy_utils.py b/ppdiffusers/ppdiffusers/pipelines/fastdeploy_utils.py deleted file mode 100644 index 85bd3211b317..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/fastdeploy_utils.py +++ /dev/null @@ -1,1477 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 The HuggingFace Inc. team. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
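The classifier-free guidance step in the DiT pipeline removed above runs the transformer on a doubled batch, splits the noise prediction into conditional and unconditional halves, and recombines them before re-attaching the learned-variance channels. A small self-contained sketch of that arithmetic, with random tensors standing in for real transformer outputs:

import paddle

guidance_scale = 4.0
latent_channels = 4
# pretend transformer output for a doubled batch (cond + uncond), with learned sigma appended
noise_pred = paddle.randn([4, 2 * latent_channels, 32, 32])

eps, rest = noise_pred[:, :latent_channels], noise_pred[:, latent_channels:]
bs = eps.shape[0]
cond_eps, uncond_eps = paddle.split(eps, [bs // 2, bs - bs // 2], axis=0)

half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)  # guided epsilon
eps = paddle.concat([half_eps, half_eps], axis=0)                 # keep the doubled batch layout
noise_pred = paddle.concat([eps, rest], axis=1)                   # re-attach variance channels for the scheduler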
- - -import inspect -import os -import re -import shutil -from pathlib import Path -from typing import Dict, List, Optional, Union - -import numpy as np - -from ..image_processor import VaeImageProcessor -from ..schedulers import ( - DDIMScheduler, - DDPMScheduler, - DEISMultistepScheduler, - DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - HeunDiscreteScheduler, - KDPM2AncestralDiscreteScheduler, - KDPM2DiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - PreconfigEulerAncestralDiscreteScheduler, - PreconfigLMSDiscreteScheduler, - UniPCMultistepScheduler, -) -from ..utils import ( - DIFFUSERS_CACHE, - FASTDEPLOY_MODEL_NAME, - FASTDEPLOY_WEIGHTS_NAME, - FROM_HF_HUB, - HF_HUB_OFFLINE, - ONNX_EXTERNAL_WEIGHTS_NAME, - ONNX_WEIGHTS_NAME, - PPDIFFUSERS_CACHE, - _add_variant, - _get_model_file, - is_fastdeploy_available, - is_paddle_available, - logging, - randn_tensor, -) -from ..version import VERSION as __version__ - -__all__ = ["FastDeployRuntimeModel", "FastDeployDiffusionPipelineMixin"] - - -if is_paddle_available(): - import paddle - -if is_fastdeploy_available(): - import fastdeploy as fd - from fastdeploy import ModelFormat - - def fdtensor2pdtensor(fdtensor: "fd.C.FDTensor"): - dltensor = fdtensor.to_dlpack() - pdtensor = paddle.utils.dlpack.from_dlpack(dltensor) - return pdtensor - - def pdtensor2fdtensor(pdtensor: paddle.Tensor, name: str = "", share_with_raw_ptr=False): - if not share_with_raw_ptr: - dltensor = paddle.utils.dlpack.to_dlpack(pdtensor) - return fd.C.FDTensor.from_dlpack(name, dltensor) - else: - return fd.C.FDTensor.from_external_data( - name, - pdtensor.data_ptr(), - pdtensor.shape, - pdtensor.dtype.name, - str(pdtensor.place), - int(pdtensor.place.gpu_device_id()), - ) - - -logger = logging.get_logger(__name__) - - -re_attention = re.compile( - r""" -\\\(| -\\\)| -\\\[| -\\]| -\\\\| -\\| -\(| -\[| -:([+-]?[.\d]+)\)| -\)| -]| -[^\\()\[\]:]+| -: -""", - re.X, -) - - -def parse_prompt_attention(text): - r""" - Parses a string with attention tokens and returns a list of pairs: text and its associated weight. 
- Accepted tokens are: - (abc) - increases attention to abc by a multiplier of 1.1 - (abc:3.12) - increases attention to abc by a multiplier of 3.12 - [abc] - decreases attention to abc by a multiplier of 1.1 - \( - literal character '(' - \[ - literal character '[' - \) - literal character ')' - \] - literal character ']' - \\ - literal character '\' - anything else - just text - >>> parse_prompt_attention('normal text') - [['normal text', 1.0]] - >>> parse_prompt_attention('an (important) word') - [['an ', 1.0], ['important', 1.1], [' word', 1.0]] - >>> parse_prompt_attention('(unbalanced') - [['unbalanced', 1.1]] - >>> parse_prompt_attention('\(literal\]') - [['(literal]', 1.0]] - >>> parse_prompt_attention('(unnecessary)(parens)') - [['unnecessaryparens', 1.1]] - >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).') - [['a ', 1.0], - ['house', 1.5730000000000004], - [' ', 1.1], - ['on', 1.0], - [' a ', 1.1], - ['hill', 0.55], - [', sun, ', 1.1], - ['sky', 1.4641000000000006], - ['.', 1.1]] - """ - - res = [] - round_brackets = [] - square_brackets = [] - - round_bracket_multiplier = 1.1 - square_bracket_multiplier = 1 / 1.1 - - def multiply_range(start_position, multiplier): - for p in range(start_position, len(res)): - res[p][1] *= multiplier - - for m in re_attention.finditer(text): - text = m.group(0) - weight = m.group(1) - - if text.startswith("\\"): - res.append([text[1:], 1.0]) - elif text == "(": - round_brackets.append(len(res)) - elif text == "[": - square_brackets.append(len(res)) - elif weight is not None and len(round_brackets) > 0: - multiply_range(round_brackets.pop(), float(weight)) - elif text == ")" and len(round_brackets) > 0: - multiply_range(round_brackets.pop(), round_bracket_multiplier) - elif text == "]" and len(square_brackets) > 0: - multiply_range(square_brackets.pop(), square_bracket_multiplier) - else: - res.append([text, 1.0]) - - for pos in round_brackets: - multiply_range(pos, round_bracket_multiplier) - - for pos in square_brackets: - multiply_range(pos, square_bracket_multiplier) - - if len(res) == 0: - res = [["", 1.0]] - - # merge runs of identical weights - i = 0 - while i + 1 < len(res): - if res[i][1] == res[i + 1][1]: - res[i][0] += res[i + 1][0] - res.pop(i + 1) - else: - i += 1 - - return res - - -def get_prompts_with_weights(pipe, prompt: List[str], max_length: int): - r""" - Tokenize a list of prompts and return its tokens with weights of each token. - No padding, starting or ending token is included. - """ - tokens = [] - weights = [] - truncated = False - for text in prompt: - texts_and_weights = parse_prompt_attention(text) - text_token = [] - text_weight = [] - for word, weight in texts_and_weights: - # tokenize and discard the starting and the ending token - token = pipe.tokenizer(word).input_ids[1:-1] - text_token += token - # copy the weight by length of token - text_weight += [weight] * len(token) - # stop if the text is too long (longer than truncation limit) - if len(text_token) > max_length: - truncated = True - break - # truncate - if len(text_token) > max_length: - truncated = True - text_token = text_token[:max_length] - text_weight = text_weight[:max_length] - tokens.append(text_token) - weights.append(text_weight) - if truncated: - logger.warning("Prompt was truncated. 
Try to shorten the prompt or increase max_embeddings_multiples") - return tokens, weights - - -def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77): - r""" - Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length. - """ - max_embeddings_multiples = (max_length - 2) // (chunk_length - 2) - weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length - for i in range(len(tokens)): - tokens[i] = [bos] + tokens[i] + [eos] + [pad] * (max_length - 2 - len(tokens[i])) - if no_boseos_middle: - weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i])) - else: - w = [] - if len(weights[i]) == 0: - w = [1.0] * weights_length - else: - for j in range(max_embeddings_multiples): - w.append(1.0) # weight for starting token in this chunk - w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))] - w.append(1.0) # weight for ending token in this chunk - w += [1.0] * (weights_length - len(w)) - weights[i] = w[:] - # we must to tensor first! - return paddle.to_tensor(tokens, dtype="int64"), paddle.to_tensor(weights, dtype="float32") - - -def get_unweighted_text_embeddings( - pipe, - text_input: paddle.Tensor, - chunk_length: int, - no_boseos_middle: Optional[bool] = True, - infer_op=None, -): - """ - When the length of tokens is a multiple of the capacity of the text encoder, - it should be split into chunks and sent to the text encoder individually. - """ - max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2) - - if max_embeddings_multiples > 1: - text_embeddings = [] - for i in range(max_embeddings_multiples): - # extract the i-th chunk - text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone() - - # cover the head and the tail by the starting and the ending tokens - text_input_chunk[:, 0] = text_input[0, 0] - text_input_chunk[:, -1] = text_input[0, -1] - - output_shape = [ - text_input_chunk.shape[0], - text_input_chunk.shape[1], - pipe.text_encoder_hidden_states_dim, - ] - text_embedding = pipe.text_encoder( - input_ids=text_input_chunk, - infer_op=infer_op, - output_shape=output_shape, - )[0] - if no_boseos_middle: - if i == 0: - # discard the ending token - text_embedding = text_embedding[:, :-1] - elif i == max_embeddings_multiples - 1: - # discard the starting token - text_embedding = text_embedding[:, 1:] - else: - # discard both starting and ending tokens - text_embedding = text_embedding[:, 1:-1] - - text_embeddings.append(text_embedding) - text_embeddings = paddle.concat(text_embeddings, axis=1) - else: - output_shape = [ - text_input.shape[0], - text_input.shape[1], - pipe.text_encoder_hidden_states_dim, - ] - text_embeddings = pipe.text_encoder( - input_ids=text_input, - infer_op=infer_op, - output_shape=output_shape, - )[0] - return text_embeddings - - -def get_weighted_text_embeddings( - pipe, - prompt: Union[str, List[str]], - uncond_prompt: Optional[Union[str, List[str]]] = None, - max_embeddings_multiples: Optional[int] = 1, - no_boseos_middle: Optional[bool] = False, - skip_parsing: Optional[bool] = False, - skip_weighting: Optional[bool] = False, - infer_op=None, - **kwargs, -): - r""" - Prompts can be assigned with local weights using brackets. For example, - prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful', - and the embedding tokens corresponding to the words get multiplied by a constant, 1.1. 
- Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the original mean. - Args: - pipe (`DiffusionPipeline`): - Pipe to provide access to the tokenizer and the text encoder. - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - uncond_prompt (`str` or `List[str]`): - The unconditional prompt or prompts for guide the image generation. If unconditional prompt - is provided, the embeddings of prompt and uncond_prompt are concatenated. - max_embeddings_multiples (`int`, *optional*, defaults to `1`): - The max multiple length of prompt embeddings compared to the max output length of text encoder. - no_boseos_middle (`bool`, *optional*, defaults to `False`): - If the length of text token is multiples of the capacity of text encoder, whether reserve the starting and - ending token in each of the chunk in the middle. - skip_parsing (`bool`, *optional*, defaults to `False`): - Skip the parsing of brackets. - skip_weighting (`bool`, *optional*, defaults to `False`): - Skip the weighting. When the parsing is skipped, it is forced True. - """ - max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 - if isinstance(prompt, str): - prompt = [prompt] - - if not skip_parsing: - prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2) - if uncond_prompt is not None: - if isinstance(uncond_prompt, str): - uncond_prompt = [uncond_prompt] - uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2) - else: - prompt_tokens = [ - token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids - ] - prompt_weights = [[1.0] * len(token) for token in prompt_tokens] - if uncond_prompt is not None: - if isinstance(uncond_prompt, str): - uncond_prompt = [uncond_prompt] - uncond_tokens = [ - token[1:-1] - for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids - ] - uncond_weights = [[1.0] * len(token) for token in uncond_tokens] - - # round up the longest length of tokens to a multiple of (model_max_length - 2) - max_length = max([len(token) for token in prompt_tokens]) - if uncond_prompt is not None: - max_length = max(max_length, max([len(token) for token in uncond_tokens])) - - max_embeddings_multiples = min( - max_embeddings_multiples, - (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, - ) - max_embeddings_multiples = max(1, max_embeddings_multiples) - max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 - - # pad the length of tokens and weights - # support bert tokenizer - bos = pipe.tokenizer.bos_token_id if pipe.tokenizer.bos_token_id is not None else pipe.tokenizer.cls_token_id - eos = pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id is not None else pipe.tokenizer.sep_token_id - pad = pipe.tokenizer.pad_token_id - - prompt_tokens, prompt_weights = pad_tokens_and_weights( - prompt_tokens, - prompt_weights, - max_length, - bos, - eos, - pad, - no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, - ) - if uncond_prompt is not None: - uncond_tokens, uncond_weights = pad_tokens_and_weights( - uncond_tokens, - uncond_weights, - max_length, - bos, - eos, - pad, - no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, - ) - # get the embeddings - text_embeddings = get_unweighted_text_embeddings( - pipe, - prompt_tokens, - pipe.tokenizer.model_max_length, - 
no_boseos_middle=no_boseos_middle, - infer_op=infer_op, - ) - if uncond_prompt is not None: - uncond_embeddings = get_unweighted_text_embeddings( - pipe, - uncond_tokens, - pipe.tokenizer.model_max_length, - no_boseos_middle=no_boseos_middle, - infer_op=infer_op, - ) - # assign weights to the prompts and normalize in the sense of mean - # TODO: should we normalize by chunk or in a whole (current implementation)? - if (not skip_parsing) and (not skip_weighting): - previous_mean = text_embeddings.mean(axis=[-2, -1]) - text_embeddings *= prompt_weights.unsqueeze(-1) - text_embeddings *= (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) - if uncond_prompt is not None: - previous_mean = uncond_embeddings.mean(axis=[-2, -1]) - uncond_embeddings *= uncond_weights.unsqueeze(-1) - uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) - - if uncond_prompt is not None: - return text_embeddings, uncond_embeddings - return text_embeddings, None - - -class FastDeployDiffusionPipelineMixin: - def prepare_infer_op_dict(self, infer_op_dict=None, **kwargs): - if infer_op_dict is None: - infer_op_dict = {} - new_infer_op_dict = {} - for name in dir(self): - if name.startswith("_"): - continue - module = getattr(self, name) - if isinstance(module, FastDeployRuntimeModel): - infer_op = infer_op_dict.get(name, "zero_copy_infer") if module.is_spport_zero_copy() else "raw" - # if parse_prompt_type in ["lpw", "webui"] and name in ["text_encoder"]: - # if infer_op != "raw": - # logger.warning( - # f"When parse_prompt_type is `{parse_prompt_type}` and module is `{name}`, we will set infer_op to `raw` instead of `{infer_op}`!" - # ) - # infer_op = "raw" - new_infer_op_dict[name] = infer_op - return new_infer_op_dict - - def post_init(self, vae_scaling_factor=0.18215, vae_scale_factor=8, dtype="float32"): - self.vae_scaling_factor = vae_scaling_factor - self.vae_scale_factor = vae_scale_factor - - self.image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor, do_convert_rgb=True) - self.control_image_processor = VaeImageProcessor( - vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False - ) - self.dtype = dtype - self.supported_scheduler = [ - "pndm", - "lms", - "preconfig-lms", - "euler", - "euler-ancestral", - "preconfig-euler-ancestral", - "dpm-multi", - "dpm-single", - "unipc-multi", - "ddim", - "ddpm", - "deis-multi", - "heun", - "kdpm2-ancestral", - "kdpm2", - ] - self.orginal_scheduler_config = self.scheduler.config - - @property - def vae_encoder_num_channels(self): - if self.vae_encoder is None: - return 3 - return self.vae_encoder.model.get_input_info(0).shape[1] - - @property - def vae_decoder_num_latent_channels(self): - if self.vae_decoder is None: - return 4 - return self.vae_decoder.model.get_input_info(0).shape[1] - - @property - def unet_num_latent_channels(self): - return self.unet.model.get_input_info(0).shape[1] - - @property - def unet_hidden_states_dim(self): - return self.unet.model.get_input_info(2).shape[2] - - @property - def text_encoder_hidden_states_dim(self): - if not hasattr(self, "text_encoder") or self.text_encoder is None: - return 768 - return self.text_encoder.model.get_output_info(0).shape[2] - - def change_scheduler(self, scheduler_type="ddim"): - scheduler_type = scheduler_type.lower() - if scheduler_type == "pndm": - scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) - elif scheduler_type == "lms": - scheduler = 
LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "preconfig-lms": - scheduler = PreconfigLMSDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "preconfig-euler-ancestral": - scheduler = PreconfigEulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "dpm-multi": - scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "dpm-single": - scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "kdpm2-ancestral": - scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "kdpm2": - scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "unipc-multi": - scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) - elif scheduler_type == "ddim": - scheduler = DDIMScheduler.from_config( - self.orginal_scheduler_config, - steps_offset=1, - clip_sample=False, - set_alpha_to_one=False, - ) - elif scheduler_type == "ddpm": - scheduler = DDPMScheduler.from_config( - self.orginal_scheduler_config, - ) - elif scheduler_type == "deis-multi": - scheduler = DEISMultistepScheduler.from_config( - self.orginal_scheduler_config, - ) - else: - raise ValueError( - f"Scheduler of type {scheduler_type} doesn't exist! Please choose in {self.supported_scheduler}!" - ) - self.scheduler = scheduler - - def get_timesteps(self, num_inference_steps, strength=1.0): - if strength >= 1: - return self.scheduler.timesteps.cast(self.dtype), num_inference_steps - - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :].cast(self.dtype) - - if hasattr(self.scheduler, "step_index_offset"): - self.scheduler.step_index_offset = t_start * self.scheduler.order - - num_inference_steps = num_inference_steps - t_start - # check that number of inference steps is not < 1 - as this doesn't make sense - if num_inference_steps < 1: - raise ValueError( - f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline" - f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline." 
- ) - - return timesteps, num_inference_steps - - def prepare_controlnet_cond( - self, - controlnet_cond, - controlnet_conditioning_scale, - width, - height, - batch_size, - num_images_per_prompt, - do_classifier_free_guidance=False, - ): - control_image = self.control_image_processor.preprocess( - controlnet_cond, - height=height, - width=width, - ) - if isinstance(controlnet_conditioning_scale, (float, int)): - controlnet_conditioning_scale = paddle.to_tensor([controlnet_conditioning_scale] * 13, dtype=self.dtype) - elif isinstance(controlnet_conditioning_scale, (list, tuple)): - controlnet_conditioning_scale = paddle.to_tensor(controlnet_conditioning_scale, dtype=self.dtype) - else: - raise ValueError( - f"`controlnet_conditioning_scale` has to be of type `float` or `int` or `list` or `tuple` but is {type(controlnet_conditioning_scale)}" - ) - assert controlnet_conditioning_scale.shape[0] == 13 - image_batch_size = control_image.shape[0] - if image_batch_size == 1: - repeat_by = batch_size - else: - # image batch size is the same as prompt batch size - repeat_by = num_images_per_prompt - control_image = control_image.repeat_interleave(repeat_by, axis=0) - if do_classifier_free_guidance: - control_image = paddle.concat([control_image] * 2) - return control_image, controlnet_conditioning_scale - - def check_inputs( - self, - prompt, - height=512, - width=512, - callback_steps=1, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - strength=1.0, - ): - if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0: - raise ValueError( - f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}." - ) - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." 
- ) - if strength < 0 or strength > 1: - raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") - - def prepare_latents( - self, - batch_size, - height, - width, - generator, - latents=None, - image=None, - timestep=None, - is_strength_max=True, - return_noise=False, - return_image_latents=False, - infer_op=None, - ): - shape = [ - batch_size, - self.vae_decoder_num_latent_channels, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ] - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if (image is None or timestep is None) and not is_strength_max: - raise ValueError( - "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise." - "However, either the image or the noise timestep has not been provided." - ) - - if return_image_latents or (latents is None and not is_strength_max): - image = image.cast(dtype=self.dtype) - image_latents = self._encode_vae_image(image, infer_op) - - if latents is None: - noise = randn_tensor(shape, generator=generator, dtype=self.dtype) - # if strength is 1. then initialise the latents to noise, else initial to image + noise - latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) - # if pure noise then scale the initial latents by the Scheduler's init sigma - latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents - else: - noise = latents - if str(noise.dtype).replace("paddle.", "") != self.dtype: - noise = noise.cast(self.dtype) - latents = noise * self.scheduler.init_noise_sigma - - outputs = (latents,) - - if return_noise: - outputs += (noise,) - - if return_image_latents: - outputs += (image_latents,) - - if len(outputs) == 1: - outputs = latents - return outputs - - def prepare_mask_latents( - self, - mask, - masked_image, - batch_size, - height, - width, - do_classifier_free_guidance, - return_masked_image_latents=True, - infer_op=None, - ): - # resize the mask to latents shape as we concatenate the mask to the latents - # we do that before converting to dtype to avoid breaking in case we're using cpu_offload - # and half precision - mask = paddle.nn.functional.interpolate( - mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) - ) - mask = mask.cast(dtype=self.dtype) - - # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method - if mask.shape[0] < batch_size: - if not batch_size % mask.shape[0] == 0: - raise ValueError( - "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to" - f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number" - " of masks that you pass is divisible by the total requested batch size." - ) - mask = mask.tile([batch_size // mask.shape[0], 1, 1, 1]) - - mask = paddle.concat([mask] * 2) if do_classifier_free_guidance else mask - if not return_masked_image_latents: - return mask - - masked_image = masked_image.cast(dtype=self.dtype) - masked_image_latents = self._encode_vae_image(masked_image, infer_op) - if masked_image_latents.shape[0] < batch_size: - if not batch_size % masked_image_latents.shape[0] == 0: - raise ValueError( - "The passed images and the required batch size don't match. 
Images are supposed to be duplicated" - f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." - " Make sure the number of images that you pass is divisible by the total requested batch size." - ) - masked_image_latents = masked_image_latents.tile([batch_size // masked_image_latents.shape[0], 1, 1, 1]) - - masked_image_latents = ( - paddle.concat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents - ) - - # aligning device to prevent device errors when concating it with the latent model input - masked_image_latents = masked_image_latents.cast(dtype=self.dtype) - return mask, masked_image_latents - - def is_scheduler_support_step_index(self): - kwargs_keys = set(inspect.signature(self.scheduler.step).parameters.keys()) - return "kwargs" in kwargs_keys or "step_index" in kwargs_keys - - def _encode_vae_image(self, image: paddle.Tensor, infer_op=None, **kwargs): - image_shape = image.shape - output_shape = [ - image_shape[0], - self.vae_decoder_num_latent_channels, - image_shape[2] // self.vae_scale_factor, - image_shape[3] // self.vae_scale_factor, - ] - image_latents = self.vae_encoder( - sample=image, - infer_op=infer_op, - output_shape=output_shape, - )[0] - - return self.vae_scaling_factor * image_latents - - def _decode_vae_latents(self, latents: paddle.Tensor, infer_op=None, **kwargs): - latents_shape = latents.shape - output_shape = [ - latents_shape[0], - self.vae_encoder_num_channels, - latents_shape[2] * self.vae_scale_factor, - latents_shape[3] * self.vae_scale_factor, - ] - images_vae = self.vae_decoder( - latent_sample=latents, - infer_op=infer_op, - output_shape=output_shape, - )[0] - - return images_vae - - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - infer_op=None, - parse_prompt_type: Optional[str] = "lpw", - max_embeddings_multiples: Optional[int] = 3, - **kwargs, - ): - if parse_prompt_type == "lpw": - return self._encode_prompt_lpw( - prompt, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op, - **kwargs, - ) - elif parse_prompt_type == "raw": - return self._encode_prompt_raw( - prompt, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - infer_op=infer_op, - ) - elif parse_prompt_type == "webui": - raise NotImplementedError("`parse_prompt_type=webui` is not implemented yet.") - - def _encode_prompt_lpw( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Union[str, List[str]], - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - infer_op=None, - max_embeddings_multiples: Optional[int] = 3, - **kwargs, - ): - r""" - Encodes the prompt into text encoder hidden states. 
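For readers tracing the removed `_encode_vae_image` / `_decode_vae_latents` helpers above, the shape and scale bookkeeping they perform is small enough to sketch with dummy tensors. The constants below (a spatial factor of 8 and a latent scaling of 0.18215) are the usual Stable-Diffusion values and are assumptions here, not read from the exported models:

    import paddle

    vae_scale_factor = 8          # assumed spatial downsampling of the VAE
    vae_scaling_factor = 0.18215  # assumed latent scaling constant

    image = paddle.rand([1, 3, 512, 512])  # pixel-space input
    latents = paddle.rand([1, 4, 512 // vae_scale_factor, 512 // vae_scale_factor])

    # what _encode_vae_image returns: latents multiplied by the scaling constant
    scaled_latents = vae_scaling_factor * latents

    # what _decode_vae_latents reconstructs as its output_shape argument
    output_shape = [latents.shape[0], 3,
                    latents.shape[2] * vae_scale_factor,
                    latents.shape[3] * vae_scale_factor]
    print(output_shape)  # [1, 3, 512, 512]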
- - Args: - prompt (`str` or `list(int)`): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - max_embeddings_multiples (`int`, *optional*, defaults to `3`): - The max multiple length of prompt embeddings compared to the max output length of text encoder. - """ - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None and negative_prompt_embeds is None: - uncond_tokens: List[str] = None - if do_classifier_free_guidance: - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - prompt_embeds, negative_prompt_embeds = get_weighted_text_embeddings( - pipe=self, - prompt=prompt, - uncond_prompt=uncond_tokens, - max_embeddings_multiples=max_embeddings_multiples, - infer_op="raw", # NOTE: we can't use zero copy! - **kwargs, - ) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - return prompt_embeds - - def _encode_prompt_raw( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - infer_op=None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. 
If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # get prompt text embeddings - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids # check - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - prompt_embeds = self.text_encoder( - input_ids=text_input_ids, - infer_op=infer_op, - output_shape=[ - batch_size, - self.tokenizer.model_max_length, - self.text_encoder_hidden_states_dim, - ], - )[0] - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - if negative_prompt is None: - uncond_tokens = [""] - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - negative_prompt_embeds = self.text_encoder( - input_ids=uncond_input.input_ids, - infer_op=infer_op, - output_shape=[ - batch_size, - max_length, - self.text_encoder_hidden_states_dim, - ], - )[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - def run_safety_checker(self, image): - if self.safety_checker is None: - has_nsfw_concept = None - else: - if paddle.is_tensor(image): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - else: - feature_extractor_input = self.image_processor.numpy_to_pil(image) - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="np") - image, has_nsfw_concept = self.safety_checker( - images=image.numpy(), - clip_input=safety_checker_input.pixel_values.astype(self.dtype), - infer_op="raw", - ) - image = paddle.to_tensor(image, dtype=self.dtype) - return image, has_nsfw_concept - - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
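`_encode_prompt_raw` above ends by stacking the negative and positive embeddings into a single doubled batch, and the sampling loop later splits the UNet prediction back apart and recombines it with the guidance scale. A shape-and-arithmetic sketch with dummy tensors (the hidden size, latent dims and guidance value are illustrative):

    import paddle

    guidance_scale = 7.5
    prompt_embeds = paddle.ones([1, 77, 768])
    negative_prompt_embeds = paddle.zeros([1, 77, 768])

    # unconditional first, conditional second: one UNet pass instead of two
    prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])  # [2, 77, 768]

    # pretend the UNet produced one prediction per element of the doubled batch
    noise_pred = paddle.concat([paddle.zeros([1, 4, 64, 64]), paddle.ones([1, 4, 64, 64])])

    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
    print(float(noise_pred.mean()))  # 7.5 with these dummy values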
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - -class FastDeployRuntimeModel: - def __init__(self, model=None, **kwargs): - logger.info("`ppdiffusers.FastDeployRuntimeModel` is experimental and might change in the future.") - self.model = model - self.model_save_dir = kwargs.get("model_save_dir", None) - self.model_format = kwargs.get("model_format", None) - self.latest_model_name = kwargs.get("latest_model_name", None) - self.latest_params_name = kwargs.get("latest_params_name", None) - - if self.model_format in [ModelFormat.PADDLE, "PADDLE", None]: - if self.latest_model_name is None: - self.latest_model_name = FASTDEPLOY_MODEL_NAME - if self.latest_params_name is None: - self.latest_params_name = FASTDEPLOY_WEIGHTS_NAME - self.model_format = ModelFormat.PADDLE - if self.model_format in [ModelFormat.ONNX, "ONNX"]: - if self.latest_model_name is None: - self.latest_model_name = ONNX_WEIGHTS_NAME - self.latest_params_name = None - self.model_format = ModelFormat.ONNX - - def is_spport_zero_copy(self): - if self.model.runtime_option._option.backend == fd.Backend.PDINFER: - return self.model.runtime_option._option.paddle_infer_option.enable_trt - # currently we donot spport zero copy model with fd.Backend.LITE. - elif self.model.runtime_option._option.backend == fd.Backend.LITE: - return False - else: - return False - - def zero_copy_infer(self, prebinded_inputs: dict, prebinded_outputs: dict, share_with_raw_ptr=True, **kwargs): - """ - Execute inference without copying data from cpu to gpu. - - Arguments: - kwargs (`dict(name, paddle.Tensor)`): - An input map from name to tensor. - Return: - List of output tensor. 
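`FastDeployRuntimeModel.__call__` (continued below) dispatches between two inference paths: `zero_copy_infer`, which pre-allocates the output tensor and binds it to the runtime, and `raw`, which round-trips every tensor through numpy. The stripped-down sketch below mirrors that dispatch against a stand-in runtime object; `ToyRuntime` is purely illustrative and is not the real FastDeploy API:

    import numpy as np
    import paddle

    class ToyRuntime:
        """Stand-in for fd.Runtime; it merely echoes its first input."""
        def infer(self, inputs):
            return [next(iter(inputs.values()))]

    def run(runtime, infer_op="raw", output_shape=None, **inputs):
        if infer_op == "zero_copy_infer":
            # pre-allocate the output; the real code binds it (and the inputs) to the runtime
            output = paddle.zeros(output_shape, dtype="float32")
            return [output]
        if infer_op == "raw":
            np_inputs = {k: (v.numpy() if paddle.is_tensor(v) else np.array(v)) for k, v in inputs.items()}
            return [paddle.to_tensor(o) for o in runtime.infer(np_inputs)]
        raise ValueError(f"Unknown infer_op {infer_op}")

    print(run(ToyRuntime(), sample=paddle.ones([1, 4, 8, 8]))[0].shape)  # [1, 4, 8, 8]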
- """ - for inputs_name, inputs_tensor in prebinded_inputs.items(): - input_fdtensor = pdtensor2fdtensor(inputs_tensor, inputs_name, share_with_raw_ptr=share_with_raw_ptr) - self.model.bind_input_tensor(inputs_name, input_fdtensor) - - for outputs_name, outputs_tensor in prebinded_outputs.items(): - output_fdtensor = pdtensor2fdtensor(outputs_tensor, outputs_name, share_with_raw_ptr=share_with_raw_ptr) - self.model.bind_output_tensor(outputs_name, output_fdtensor) - - self.model.zero_copy_infer() - - def __call__(self, **kwargs): - infer_op = kwargs.pop("infer_op", None) - if infer_op is None: - infer_op = "raw" - # for zero_copy_infer - share_with_raw_ptr = kwargs.pop("share_with_raw_ptr", True) - output_shape = kwargs.pop("output_shape", None) - - inputs = {} - for k, v in kwargs.items(): - if k == "timestep": - v = v.astype("float32") - inputs[k] = v - - if infer_op == "zero_copy_infer": - output = paddle.zeros(output_shape, dtype="float32") - self.zero_copy_infer( - prebinded_inputs=inputs, - prebinded_outputs={self.model.get_output_info(0).name: output}, - share_with_raw_ptr=share_with_raw_ptr, - ) - return [ - output, - ] - elif infer_op == "raw": - inputs = {} - for k, v in kwargs.items(): - if paddle.is_tensor(v): - v = v.numpy() - inputs[k] = np.array(v) - return [paddle.to_tensor(output) for output in self.model.infer(inputs)] - else: - raise ValueError("Unknown infer_op {}".format(infer_op)) - - @staticmethod - def load_model( - model_path: Union[str, Path], - params_path: Union[str, Path] = None, - runtime_options: Optional["fd.RuntimeOption"] = None, - ): - """ - Loads an FastDeploy Inference Model with fastdeploy.RuntimeOption - - Arguments: - model_path (`str` or `Path`): - Model path from which to load - params_path (`str` or `Path`): - Params path from which to load - runtime_options (fd.RuntimeOption, *optional*): - The RuntimeOption of fastdeploy to initialize the fastdeploy runtime. Default setting - the device to cpu and the backend to paddle inference - """ - option = runtime_options - if option is None or not isinstance(runtime_options, fd.RuntimeOption): - logger.info("No fastdeploy.RuntimeOption specified, using CPU device and paddle inference backend.") - option = fd.RuntimeOption() - option.use_paddle_backend() - option.use_cpu() - - if params_path is None or model_path.endswith(".onnx"): - option.use_ort_backend() - option.set_model_path(model_path, model_format=ModelFormat.ONNX) - else: - option.set_model_path(model_path, params_path) - - # set cache file - option.set_trt_cache_file(str(Path(model_path).parent / "_opt_cache/")) - option.set_lite_model_cache_dir(str(Path(model_path).parent)) - - return fd.Runtime(option) - - def _save_pretrained( - self, - save_directory: Union[str, Path], - model_file_name: Optional[str] = None, - params_file_name: Optional[str] = None, - **kwargs - ): - """ - Save a model and its configuration file to a directory, so that it can be re-loaded using the - [`~FastDeployRuntimeModel.from_pretrained`] class method. It will always save the - latest_model_name. - - Arguments: - save_directory (`str` or `Path`): - Directory where to save the model file. - model_file_name(`str`, *optional*): - Overwrites the default model file name from `"inference.pdmodel"` to `model_file_name`. This allows you to save the - model with a different name. - params_file_name(`str`, *optional*): - Overwrites the default model file name from `"inference.pdiparams"` to `params_file_name`. This allows you to save the - model with a different name. 
- """ - is_onnx_model = self.model_format == ModelFormat.ONNX - model_file_name = ( - model_file_name - if model_file_name is not None - else FASTDEPLOY_MODEL_NAME - if not is_onnx_model - else ONNX_WEIGHTS_NAME - ) - params_file_name = params_file_name if params_file_name is not None else FASTDEPLOY_WEIGHTS_NAME - - src_model_path = self.model_save_dir.joinpath(self.latest_model_name) - dst_model_path = Path(save_directory).joinpath(model_file_name) - - try: - shutil.copyfile(src_model_path, dst_model_path) - except shutil.SameFileError: - pass - - if is_onnx_model: - # copy external weights (for models >2GB) - src_model_path = self.model_save_dir.joinpath(ONNX_EXTERNAL_WEIGHTS_NAME) - if src_model_path.exists(): - dst_model_path = Path(save_directory).joinpath(ONNX_EXTERNAL_WEIGHTS_NAME) - try: - shutil.copyfile(src_model_path, dst_model_path) - except shutil.SameFileError: - pass - - if not is_onnx_model: - src_params_path = self.model_save_dir.joinpath(self.latest_params_name) - dst_params_path = Path(save_directory).joinpath(params_file_name) - try: - shutil.copyfile(src_params_path, dst_params_path) - except shutil.SameFileError: - pass - - def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - **kwargs, - ): - """ - Save a model to a directory, so that it can be re-loaded using the [`~FastDeployRuntimeModel.from_pretrained`] class - method.: - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to which to save. Will be created if it doesn't exist. - """ - if os.path.isfile(save_directory): - logger.error(f"Provided path ({save_directory}) should be a directory, not a file") - return - - os.makedirs(save_directory, exist_ok=True) - - # saving model weights/files - self._save_pretrained(save_directory, **kwargs) - - @classmethod - def _from_pretrained( - cls, - pretrained_model_name_or_path: Union[str, Path], - model_file_name: Optional[str] = None, - params_file_name: Optional[str] = None, - use_auth_token: Optional[Union[bool, str, None]] = None, - revision: Optional[str] = None, - subfolder: Optional[str] = None, - force_download: bool = False, - cache_dir: Optional[str] = None, - runtime_options: Optional["fd.RuntimeOption"] = None, - from_hf_hub: Optional[bool] = False, - proxies: Optional[Dict] = None, - resume_download: bool = False, - local_files_only: bool = False, - user_agent: Union[Dict, str, None] = None, - is_onnx_model: bool = False, - **kwargs, - ): - """ - Load a model from a directory or the HF Hub. - - Arguments: - pretrained_model_name_or_path (`str` or `Path`): - Directory from which to load - model_file_name (`str`): - Overwrites the default model file name from `"inference.pdmodel"` to `file_name`. This allows you to load - different model files from the same repository or directory. - params_file_name (`str`): - Overwrites the default params file name from `"inference.pdiparams"` to `file_name`. This allows you to load - different model files from the same repository or directory. - use_auth_token (`str` or `bool`): - Is needed to load models from a private or gated repository - revision (`str`): - Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id - cache_dir (`Union[str, Path]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. 
- force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - runtime_options (`fastdeploy.RuntimeOption`, *optional*): - The RuntimeOption of fastdeploy. - subfolder (`str`, *optional*, defaults to `""`): - In case the relevant files are located inside a subfolder of the model repo (either remote in - huggingface.co or downloaded locally), you can specify the folder name here. - kwargs (`Dict`, *optional*): - kwargs will be passed to the model during initialization - """ - - model_file_name = ( - model_file_name - if model_file_name is not None - else FASTDEPLOY_MODEL_NAME - if not is_onnx_model - else ONNX_WEIGHTS_NAME - ) - params_file_name = params_file_name if params_file_name is not None else FASTDEPLOY_WEIGHTS_NAME - kwargs["model_format"] = "ONNX" if is_onnx_model else "PADDLE" - - # load model from local directory - if os.path.isdir(pretrained_model_name_or_path): - model_path = os.path.join(pretrained_model_name_or_path, model_file_name) - params_path = None if is_onnx_model else os.path.join(pretrained_model_name_or_path, params_file_name) - model = FastDeployRuntimeModel.load_model( - model_path, - params_path, - runtime_options=runtime_options, - ) - kwargs["model_save_dir"] = Path(pretrained_model_name_or_path) - # load model from hub or paddle bos - else: - model_cache_path = _get_model_file( - pretrained_model_name_or_path=pretrained_model_name_or_path, - weights_name=model_file_name, - subfolder=subfolder, - cache_dir=cache_dir, - force_download=force_download, - revision=revision, - from_hf_hub=from_hf_hub, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - user_agent=user_agent, - ) - if is_onnx_model: - params_cache_path = None - kwargs["latest_params_name"] = None - else: - params_cache_path = _get_model_file( - pretrained_model_name_or_path=pretrained_model_name_or_path, - weights_name=params_file_name, - subfolder=subfolder, - cache_dir=cache_dir, - force_download=force_download, - revision=revision, - from_hf_hub=from_hf_hub, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - user_agent=user_agent, - ) - kwargs["latest_params_name"] = Path(params_cache_path).name - kwargs["model_save_dir"] = Path(model_cache_path).parent - kwargs["latest_model_name"] = Path(model_cache_path).name - - model = FastDeployRuntimeModel.load_model( - model_cache_path, - params_cache_path, - runtime_options=runtime_options, - ) - return cls(model=model, **kwargs) - - @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: Union[str, Path], - model_file_name: Optional[str] = None, - params_file_name: Optional[str] = None, - runtime_options: Optional["fd.RuntimeOption"] = None, - is_onnx_model: bool = False, - **kwargs, - ): - from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = ( - kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) - ) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) - use_auth_token = kwargs.pop("use_auth_token", None) - revision = kwargs.pop("revision", None) - subfolder = kwargs.pop("subfolder", None) - variant = kwargs.pop("variant", 
None) - - user_agent = { - "ppdiffusers": __version__, - "file_type": "model", - "framework": "fastdeploy", - } - - return cls._from_pretrained( - pretrained_model_name_or_path=pretrained_model_name_or_path, - model_file_name=_add_variant(model_file_name, variant), - params_file_name=_add_variant(params_file_name, variant), - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - force_download=force_download, - cache_dir=cache_dir, - runtime_options=runtime_options, - from_hf_hub=from_hf_hub, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - user_agent=user_agent, - is_onnx_model=is_onnx_model, - **kwargs, - ) diff --git a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py deleted file mode 100644 index dd119ef22d12..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa -from ...utils import is_paddlenlp_available -from .pipeline_latent_diffusion_superresolution import LDMSuperResolutionPipeline - -if is_paddlenlp_available(): - from .pipeline_latent_diffusion import ( - LDMBertConfig, - LDMBertModel, - LDMTextToImagePipeline, - ) diff --git a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py deleted file mode 100644 index fd31755c0773..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +++ /dev/null @@ -1,802 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import inspect -from functools import partial -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import paddle -import paddle.nn as nn - -from paddlenlp.transformers import ( - PretrainedConfig, - PretrainedModel, - PretrainedTokenizer, - register_base_model, -) -from paddlenlp.transformers.model_outputs import ( - BaseModelOutputWithPoolingAndCrossAttentions, -) - -from ...configuration_utils import FrozenDict -from ...models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel -from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler -from ...utils import deprecate, logging, randn_tensor, replace_example_docstring -from ...utils.initializer_utils import normal_, zeros_ -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import paddle - >>> from ppdiffusers import LDMTextToImagePipeline - - >>> pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256", paddle_dtype=paddle.float16) - - >>> prompt = "a photo of an astronaut riding a horse on mars" - >>> image = pipe(prompt).images[0] - ``` -""" - - -class LDMTextToImagePipeline(DiffusionPipeline): - r""" - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular xxxx, etc.) - - Parameters: - vqvae ([`VQModel`]): - Vector-quantized (VQ) Model to encode and decode images to and from latent representations. - bert ([`LDMBertModel`]): - Text-encoder model based on [BERT](https://paddlenlp.readthedocs.io/zh/latest/source/paddlenlp.transformers.bert.modeling.html#paddlenlp.transformers.bert.modeling.BertModel) architecture. - tokenizer (`paddlenlp.transformers.BertTokenizer`): - Tokenizer of class - [BertTokenizer](https://paddlenlp.readthedocs.io/zh/latest/source/paddlenlp.transformers.bert.tokenizer.html#paddlenlp.transformers.bert.tokenizer.BertTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`PNDMScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] - or [`DPMSolverMultistepScheduler`]. - """ - - def __init__( - self, - vqvae: Union[VQModel, AutoencoderKL], - bert: PretrainedModel, - tokenizer: PretrainedTokenizer, - unet: Union[UNet2DModel, UNet2DConditionModel], - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], - ): - super().__init__() - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " - "to update the config accordingly as leaving `steps_offset` might led to incorrect results" - " in future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub," - " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file" - ) - deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." - " `clip_sample` should be set to False in the configuration file. Please make sure to update the" - " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" - " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" - " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" - ) - deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["clip_sample"] = False - scheduler._internal_dict = FrozenDict(new_config) - - if tokenizer.model_max_length > 77: - tokenizer.model_max_length = 77 - self.register_modules(vqvae=vqvae, bert=bert, tokenizer=tokenizer, unet=unet, scheduler=scheduler) - self.vae_scale_factor = 8 # 2 ** (len(self.vqvae.config.block_out_channels) - 1) - - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. 
- """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because LDMBert can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - prompt_embeds = self.bert( - text_input_ids, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(self.bert.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - negative_prompt_embeds = self.bert( - uncond_input.input_ids, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.bert.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. 
- # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - def decode_latents(self, latents): - latents = 1 / 0.18215 * latents - image = self.vqvae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = [batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor] - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. 
Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]] = None, - height: Optional[int] = 256, - width: Optional[int] = 256, - num_inference_steps: int = 50, - guidance_scale: float = 1.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ) -> Union[Tuple, ImagePipelineOutput]: - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`int`, *optional*, defaults to 256): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to 256): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 1.0): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. 
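`prepare_latents` above boils down to: draw Gaussian noise at 1/8 of the pixel resolution and scale it by the scheduler's initial sigma. A standalone sketch using `paddle.randn` in place of `randn_tensor`; the channel count and sigma value are illustrative (in practice they come from `unet.in_channels` and `scheduler.init_noise_sigma`):

    import paddle

    batch_size, num_channels_latents = 1, 4  # 4 latent channels is an assumption
    height = width = 256
    vae_scale_factor = 8
    init_noise_sigma = 1.0                   # stand-in for scheduler.init_noise_sigma

    shape = [batch_size, num_channels_latents,
             height // vae_scale_factor, width // vae_scale_factor]
    latents = paddle.randn(shape) * init_noise_sigma
    print(latents.shape)  # [1, 4, 32, 32]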
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/ppdiffusers/models/cross_attention.py). - - Examples: - - Returns: - [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is - True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. - """ - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = self.unet.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - image = latents - elif output_type == "pil": - # 8. Post-processing - image = self.decode_latents(latents) - # 10. Convert to PIL - image = self.numpy_to_pil(image) - else: - # 8. Post-processing - image = self.decode_latents(latents) - - if not return_dict: - return (image,) - - return ImagePipelineOutput(images=image) - - -################################################################################ -# Code for the text transformer model -################################################################################ -""" Paddle LDMBERT model.""" - -""" LDMBERT model configuration""" - - -class LDMBertConfig(PretrainedConfig): - model_type = "ldmbert" - keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} - - def __init__( - self, - vocab_size=30522, - max_position_embeddings=77, - encoder_layers=32, - encoder_ffn_dim=5120, - encoder_attention_heads=8, - head_dim=64, - encoder_layerdrop=0.0, - activation_function="gelu", - d_model=1280, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - classifier_dropout=0.0, - scale_embedding=False, - use_cache=True, - pad_token_id=0, - **kwargs, - ): - kwargs["return_dict"] = kwargs.pop("return_dict", True) - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.d_model = d_model - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.head_dim = head_dim - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.activation_function = activation_function - self.init_std = init_std - self.encoder_layerdrop = encoder_layerdrop - self.classifier_dropout = classifier_dropout - self.use_cache = use_cache - self.num_hidden_layers = encoder_layers - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - - super().__init__(pad_token_id=pad_token_id, **kwargs) - - -class LDMBertPretrainedModel(PretrainedModel): - pretrained_init_configuration = {} - pretrained_resource_files_map = {} - base_model_prefix = "ldmbert" - config_class = LDMBertConfig - _supports_gradient_checkpointing = True - 
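The configuration defaults above (32 encoder layers, 8 attention heads of width 64 projected back to `d_model=1280`, a 77-token context) determine the LDMBERT text encoder built later in this file. A construction sketch using only the classes defined here; the weights are randomly initialised, so in practice the model would be restored via `from_pretrained`:

    config = LDMBertConfig(
        vocab_size=30522,
        max_position_embeddings=77,
        encoder_layers=32,
        encoder_attention_heads=8,
        head_dim=64,
        d_model=1280,
    )
    ldmbert = LDMBertModel(config)
    # tokenised ids of shape [batch, seq_len] -> hidden states of shape [batch, seq_len, 1280]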
_keys_to_ignore_on_load_unexpected = [r"encoder\.version", r"decoder\.version"] - - def init_weights(self): - """ - A method executed at the end of each Transformer model initialization, to execute code that needs the model's - modules properly initialized (such as weight initialization). - """ - self.apply(self._init_weights) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, nn.TransformerEncoder): - module.enable_recompute = value - - def gradient_checkpointing_enable(self): - """ - Activates gradient checkpointing for the current model. - - Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint - activations". - """ - if not self.supports_gradient_checkpointing: - raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") - self.apply(partial(self._set_gradient_checkpointing, value=True)) - - def gradient_checkpointing_disable(self): - """ - Deactivates gradient checkpointing for the current model. - - Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint - activations". - """ - if self.supports_gradient_checkpointing: - self.apply(partial(self._set_gradient_checkpointing, value=False)) - - def _init_weights(self, module): - std = self.config.init_std - if isinstance(module, nn.Linear): - normal_(module.weight, mean=0.0, std=std) - if module.bias is not None: - zeros_(module.bias) - elif isinstance(module, nn.Embedding): - normal_(module.weight, mean=0.0, std=std) - if module._padding_idx is not None: - with paddle.no_grad(): - module.weight[module._padding_idx] = 0 - - -class LDMBertEmbeddings(nn.Layer): - def __init__(self, vocab_size, hidden_size=768, hidden_dropout_prob=0.0, max_position_embeddings=512): - super().__init__() - self.word_embeddings = nn.Embedding(vocab_size, hidden_size) - self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size) - self.dropout = nn.Dropout(hidden_dropout_prob) - - def forward(self, input_ids, position_ids=None): - if position_ids is None: - ones = paddle.ones_like(input_ids, dtype="int64") - seq_length = paddle.cumsum(ones, axis=-1) - position_ids = seq_length - ones - position_ids.stop_gradient = True - - input_embedings = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - - embeddings = input_embedings + position_embeddings - embeddings = self.dropout(embeddings) - return embeddings - - -class TransformerEncoderLayer(nn.TransformerEncoderLayer): - def __init__( - self, - d_model, - nhead, - dim_feedforward, - dropout=0.1, - activation="gelu", - attn_dropout=None, - act_dropout=None, - normalize_before=False, - weight_attr=None, - bias_attr=None, - head_dim=64, - ): - super().__init__( - d_model, - nhead, - dim_feedforward, - dropout, - activation, - attn_dropout, - act_dropout, - normalize_before, - weight_attr, - bias_attr, - ) - # update self attn - self.self_attn = LDMBertAttention( - d_model, head_dim, nhead, dropout=attn_dropout, weight_attr=weight_attr, bias_attr=False - ) - - -@register_base_model -class LDMBertModel(LDMBertPretrainedModel): - _no_split_modules = [] - - def __init__(self, config: LDMBertConfig): - super().__init__(config) - self.embeddings = LDMBertEmbeddings( - config.vocab_size, config.d_model, config.dropout, config.max_position_embeddings - ) - encoder_layer = TransformerEncoderLayer( - config.d_model, - config.encoder_attention_heads, - config.encoder_ffn_dim, - dropout=config.dropout, 
- activation=config.activation_function, - attn_dropout=config.attention_dropout, - act_dropout=config.activation_dropout, - normalize_before=True, - head_dim=config.head_dim, - ) - - self.encoder = nn.TransformerEncoder(encoder_layer, config.encoder_layers) - self.final_layer_norm = nn.LayerNorm(config.d_model) - self.init_weights() - - def get_input_embeddings(self): - return self.embeddings.word_embeddings - - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - - def forward( - self, - input_ids, - position_ids=None, - attention_mask=None, - output_hidden_states=False, - output_attentions=False, - return_dict=False, - ): - - if attention_mask is not None and attention_mask.ndim == 2: - # attention_mask [batch_size, sequence_length] -> [batch_size, 1, 1, sequence_length] - attention_mask = attention_mask.unsqueeze(axis=[1, 2]).astype(paddle.get_default_dtype()) - attention_mask = (1.0 - attention_mask) * -1e4 - - embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids) - - encoder_outputs = self.encoder( - embedding_output, - src_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - if isinstance(encoder_outputs, type(embedding_output)): - sequence_output = self.final_layer_norm(encoder_outputs) - return (sequence_output,) - else: - sequence_output = encoder_outputs[0] - sequence_output = self.final_layer_norm(sequence_output) - if not return_dict: - return (sequence_output,) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( - last_hidden_state=sequence_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - -class LDMBertAttention(nn.MultiHeadAttention): - def __init__( - self, - embed_dim, - head_dim, - num_heads, - dropout=0.0, - kdim=None, - vdim=None, - need_weights=False, - weight_attr=None, - bias_attr=None, - ): - super().__init__(embed_dim, num_heads, dropout, kdim, vdim, need_weights, weight_attr, bias_attr) - assert embed_dim > 0, "Expected embed_dim to be greater than 0, " "but received {}".format(embed_dim) - assert num_heads > 0, "Expected num_heads to be greater than 0, " "but received {}".format(num_heads) - - self.embed_dim = embed_dim - self.kdim = kdim if kdim is not None else embed_dim - self.vdim = vdim if vdim is not None else embed_dim - self.num_heads = num_heads - self.dropout = dropout - self.need_weights = need_weights - - self.head_dim = head_dim - self.inner_dim = head_dim * num_heads - self.scaling = self.head_dim**-0.5 - - self.q_proj = nn.Linear(embed_dim, self.inner_dim, weight_attr, bias_attr=bias_attr) - self.k_proj = nn.Linear(self.kdim, self.inner_dim, weight_attr, bias_attr=bias_attr) - self.v_proj = nn.Linear(self.vdim, self.inner_dim, weight_attr, bias_attr=bias_attr) - self.out_proj = nn.Linear(self.inner_dim, embed_dim, weight_attr) - - -class LDMBertModelForMaskedLM(LDMBertPretrainedModel): - def __init__(self, config: LDMBertConfig): - super().__init__(config) - self.ldmbert = LDMBertModel(config) - self.to_logits = nn.Linear(config.d_model, config.vocab_size) - self.init_weights() - - def forward( - self, - input_ids=None, - attention_mask=None, - position_ids=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - outputs = self.ldmbert( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - 
return_dict=return_dict, - ) - return outputs diff --git a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py deleted file mode 100644 index 24475c0af099..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle -import PIL - -from ...models import UNet2DModel, VQModel -from ...schedulers import ( - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, -) -from ...utils import PIL_INTERPOLATION, randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput - - -def preprocess(image): - w, h = image.size - w, h = (x - x % 32 for x in (w, h)) - image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) - image = np.array(image).astype(np.float32) / 255.0 - image = image[None].transpose(0, 3, 1, 2) - image = paddle.to_tensor(data=image) - return 2.0 * image - 1.0 - - -class LDMSuperResolutionPipeline(DiffusionPipeline): - """ - A pipeline for image super-resolution using Latent - - This class inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Parameters: - vqvae ([`VQModel`]): - Vector-quantized (VQ) VAE Model to encode and decode images to and from latent representations. - unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], - [`EulerAncestralDiscreteScheduler`], [`DPMSolverMultistepScheduler`], or [`PNDMScheduler`]. 
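Note: the LDMBertModel.forward shown above converts a 2D padding mask into an additive attention bias before the Paddle TransformerEncoder consumes it. A minimal, self-contained sketch of that conversion (illustrative only, not part of the removed file):

import paddle

# 1 marks a real token, 0 marks padding: shape [batch_size, seq_len]
attention_mask = paddle.to_tensor([[1, 1, 1, 0, 0]], dtype="float32")

# [batch_size, seq_len] -> [batch_size, 1, 1, seq_len]; kept positions
# become 0.0 and padded positions a large negative bias, so they are
# suppressed by the softmax inside self-attention.
additive_mask = (1.0 - attention_mask.unsqueeze(axis=[1, 2])) * -1e4

print(additive_mask.numpy())  # [[[[0. 0. 0. -10000. -10000.]]]]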
- """ - - def __init__( - self, - vqvae: VQModel, - unet: UNet2DModel, - scheduler: Union[ - DDIMScheduler, - PNDMScheduler, - LMSDiscreteScheduler, - EulerDiscreteScheduler, - EulerAncestralDiscreteScheduler, - DPMSolverMultistepScheduler, - ], - ): - super().__init__() - self.register_modules(vqvae=vqvae, unet=unet, scheduler=scheduler) - - @paddle.no_grad() - def __call__( - self, - image: Union[paddle.Tensor, PIL.Image.Image] = None, - batch_size: Optional[int] = 1, - num_inference_steps: Optional[int] = 100, - eta: Optional[float] = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - ) -> Union[Tuple, ImagePipelineOutput]: - """ - Args: - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. - batch_size (`int`, *optional*, defaults to 1): - Number of images to generate. - num_inference_steps (`int`, *optional*, defaults to 100): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*): - Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - - Returns: - [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is - True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. - """ - - if isinstance(image, PIL.Image.Image): - batch_size = 1 - elif isinstance(image, paddle.Tensor): - batch_size = image.shape[0] - else: - raise ValueError(f"`image` has to be of type `PIL.Image.Image` or `paddle.Tensor` but is {type(image)}") - if isinstance(image, PIL.Image.Image): - image = preprocess(image) - height, width = image.shape[-2:] - # in_channels should be 6: 3 for latents, 3 for low resolution image - latents_shape = (batch_size, self.unet.config.in_channels // 2, height, width) - latents_dtype = self.unet.dtype - latents = randn_tensor(latents_shape, generator=generator, dtype=latents_dtype) - image = image.cast(latents_dtype) - self.scheduler.set_timesteps(num_inference_steps) - timesteps_tensor = self.scheduler.timesteps - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_kwargs = {} - if accepts_eta: - extra_kwargs["eta"] = eta - for t in self.progress_bar(timesteps_tensor): - # concat latents and low resolution image in the channel dimension. 
- latents_input = paddle.concat(x=[latents, image], axis=1) - latents_input = self.scheduler.scale_model_input(latents_input, t) - # predict the noise residual - noise_pred = self.unet(latents_input, t).sample - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_kwargs).prev_sample - - # decode the image latents with the VQVAE - image = self.vqvae.decode(latents).sample - image = paddle.clip(x=image, min=-1.0, max=1.0) - image = image / 2 + 0.5 - image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy() - if output_type == "pil": - image = self.numpy_to_pil(image) - if not return_dict: - return (image,) - return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/__init__.py b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/__init__.py deleted file mode 100644 index efa005f31d04..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa -from .pipeline_latent_diffusion_uncond import LDMPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py deleted file mode 100644 index b2556c086b5f..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import List, Optional, Tuple, Union - -import paddle - -from ...models import UNet2DModel, VQModel -from ...schedulers import DDIMScheduler -from ...utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput - - -class LDMPipeline(DiffusionPipeline): - """ - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Parameters: - vqvae ([`VQModel`]): - Vector-quantized (VQ) Model to encode and decode images to and from latent representations. 
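Note: a hypothetical usage sketch of the LDMSuperResolutionPipeline removed above. The checkpoint id mirrors the upstream diffusers example and the top-level import is assumed; any LDM super-resolution weights laid out for ppdiffusers would work the same way:

import PIL.Image
from ppdiffusers import LDMSuperResolutionPipeline  # assumed re-export

pipe = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")
low_res = PIL.Image.open("low_res.png").convert("RGB")  # placeholder input file

# eta only takes effect with DDIM-style schedulers, as documented above
upscaled = pipe(image=low_res, num_inference_steps=100, eta=1.0).images[0]
upscaled.save("upscaled.png")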
- unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - [`DDIMScheduler`] is to be used in combination with `unet` to denoise the encoded image latents. - """ - - def __init__(self, vqvae: VQModel, unet: UNet2DModel, scheduler: DDIMScheduler): - super().__init__() - self.register_modules(vqvae=vqvae, unet=unet, scheduler=scheduler) - - @paddle.no_grad() - def __call__( - self, - batch_size: int = 1, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - eta: float = 0.0, - num_inference_steps: int = 50, - output_type: Optional[str] = "pil", - return_dict: bool = True, - **kwargs - ) -> Union[Tuple, ImagePipelineOutput]: - """ - Args: - batch_size (`int`, *optional*, defaults to 1): - Number of images to generate. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - - Returns: - [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is - True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. - """ - latents = randn_tensor( - (batch_size, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size), - generator=generator, - ) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - - self.scheduler.set_timesteps(num_inference_steps) - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_kwargs = {} - if accepts_eta: - extra_kwargs["eta"] = eta - for t in self.progress_bar(self.scheduler.timesteps): - latent_model_input = self.scheduler.scale_model_input(latents, t) - # predict the noise residual - noise_prediction = self.unet(latent_model_input, t).sample - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_prediction, t, latents, **extra_kwargs).prev_sample - image = self.vqvae.decode(latents).sample - image = (image / 2 + 0.5).clip(min=0, max=1) - image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy() - if output_type == "pil": - image = self.numpy_to_pil(image) - if not return_dict: - return (image,) - return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/paint_by_example/__init__.py b/ppdiffusers/ppdiffusers/pipelines/paint_by_example/__init__.py deleted file mode 100644 index 71563b7b0b62..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/paint_by_example/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
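Note: the unconditional LDMPipeline above needs no prompt at all. A hypothetical usage sketch; the checkpoint id follows the diffusers example and the top-level import is an assumption here:

import paddle
from ppdiffusers import LDMPipeline  # assumed re-export

paddle.seed(42)  # for reproducible sampling
pipe = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")
image = pipe(batch_size=1, num_inference_steps=50).images[0]
image.save("ldm_uncond_sample.png")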
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import List, Optional, Union - -import numpy as np -import PIL -from PIL import Image - -from ...utils import is_paddle_available, is_paddlenlp_available - -if is_paddlenlp_available() and is_paddle_available(): - from .image_encoder import PaintByExampleImageEncoder - from .pipeline_paint_by_example import PaintByExamplePipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/paint_by_example/image_encoder.py b/ppdiffusers/ppdiffusers/pipelines/paint_by_example/image_encoder.py deleted file mode 100644 index fc2277bc8e29..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/paint_by_example/image_encoder.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import paddle -from paddle import nn - -from paddlenlp.transformers import ( - CLIPPretrainedModel, - CLIPVisionConfig, - CLIPVisionModel, -) - -from ...models.attention import BasicTransformerBlock -from ...utils import logging - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class PaintByExampleImageEncoder(CLIPPretrainedModel): - config_class = CLIPVisionConfig - - def __init__(self, config: CLIPVisionConfig, proj_size=None): - super().__init__(config) - if proj_size is not None: - self.projection_dim = proj_size - else: - self.projection_dim = config.projection_dim - - self.model = CLIPVisionModel(config) - self.mapper = PaintByExampleMapper(config) - self.final_layer_norm = nn.LayerNorm(config.hidden_size) - self.proj_out = nn.Linear(config.hidden_size, self.projection_dim) - - # uncondition for scaling - self.uncond_vector = self.create_parameter( - [1, 1, self.projection_dim], - dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Assign(paddle.rand((1, 1, self.projection_dim))), - ) - - def forward(self, pixel_values, return_uncond_vector=False): - clip_output = self.model(pixel_values=pixel_values) - latent_states = clip_output.pooler_output - latent_states = self.mapper(latent_states[:, None]) - latent_states = self.final_layer_norm(latent_states) - latent_states = self.proj_out(latent_states) - if return_uncond_vector: - return latent_states, self.uncond_vector - - return latent_states - - -class PaintByExampleMapper(nn.Layer): - def __init__(self, config: CLIPVisionConfig): - super().__init__() - num_layers = (config.num_hidden_layers + 1) // 5 - hid_size = config.hidden_size - num_heads = 1 - self.blocks = nn.LayerList( - [ - BasicTransformerBlock(hid_size, num_heads, hid_size, activation_fn="gelu", attention_bias=True) - for _ in range(num_layers) - ] - ) - - def forward(self, hidden_states): - for block in self.blocks: - hidden_states = block(hidden_states) - - return hidden_states diff --git a/ppdiffusers/ppdiffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/ppdiffusers/ppdiffusers/pipelines/paint_by_example/pipeline_paint_by_example.py deleted file mode 100644 index 9415511b2b4c..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +++ /dev/null @@ -1,530 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
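Note: in the removed image_encoder.py, the depth of PaintByExampleMapper is derived from the CLIP vision tower as (num_hidden_layers + 1) // 5. A quick sanity check of that formula; the layer counts are standard CLIP values, not read from this repository:

# num_hidden_layers of the CLIP vision tower -> number of mapper blocks
for num_hidden_layers in (12, 24):  # e.g. ViT-B and ViT-L vision towers
    print(num_hidden_layers, "->", (num_hidden_layers + 1) // 5)
# 12 -> 2
# 24 -> 5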
- -import inspect -from typing import Callable, List, Optional, Union - -import numpy as np -import paddle -import PIL - -from paddlenlp.transformers import CLIPImageProcessor - -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler -from ...utils import logging, randn_tensor -from ..pipeline_utils import DiffusionPipeline -from ..stable_diffusion import StableDiffusionPipelineOutput -from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker -from .image_encoder import PaintByExampleImageEncoder - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -def prepare_mask_and_masked_image(image, mask): - """ - Prepares a pair (image, mask) to be consumed by the Paint by Example pipeline. This means that those inputs will be - converted to ``paddle.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the - ``image`` and ``1`` for the ``mask``. - - The ``image`` will be converted to ``paddle.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be - binarized (``mask > 0.5``) and cast to ``paddle.float32`` too. - - Args: - image (Union[np.array, PIL.Image, paddle.Tensor]): The image to inpaint. - It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width`` - ``paddle.Tensor`` or a ``batch x channels x height x width`` ``paddle.Tensor``. - mask (_type_): The mask to apply to the image, i.e. regions to inpaint. - It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width`` - ``paddle.Tensor`` or a ``batch x 1 x height x width`` ``paddle.Tensor``. - - - Raises: - ValueError: ``paddle.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``paddle.Tensor`` mask - should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions. - TypeError: ``mask`` is a ``paddle.Tensor`` but ``image`` is not - (ot the other way around). - - Returns: - tuple[paddle.Tensor]: The pair (mask, masked_image) as ``paddle.Tensor`` with 4 - dimensions: ``batch x channels x height x width``. 
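Note: the body of prepare_mask_and_masked_image follows below; as a companion, here is a minimal PIL-only sketch of the same preparation (image scaled to [-1, 1], mask inverted and binarized, masked image by multiplication). It skips the tensor branch and all the validation the real helper performs:

import numpy as np
import paddle
import PIL.Image

def prepare_pil(image: PIL.Image.Image, mask: PIL.Image.Image):
    # RGB image in [0, 255] -> float32 NCHW tensor in [-1, 1]
    img = np.array(image.convert("RGB"))[None].transpose(0, 3, 1, 2)
    img = paddle.to_tensor(img).cast(paddle.float32) / 127.5 - 1.0

    # grayscale mask in [0, 255] -> [0, 1], inverted (Paint-by-Example keeps
    # the region to preserve as 1), then binarized at 0.5
    m = np.array(mask.convert("L"))[None, None].astype(np.float32) / 255.0
    m = 1 - m
    m[m < 0.5] = 0
    m[m >= 0.5] = 1
    m = paddle.to_tensor(m)

    return m, img * m  # (mask, masked_image)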
- """ - if isinstance(image, paddle.Tensor): - if not isinstance(mask, paddle.Tensor): - raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not") - - # Batch single image - if image.ndim == 3: - assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" - image = image.unsqueeze(0) - - # Batch and add channel dim for single mask - if mask.ndim == 2: - mask = mask.unsqueeze(0).unsqueeze(0) - - # Batch single mask or add channel dim - if mask.ndim == 3: - # Batched mask - if mask.shape[0] == image.shape[0]: - mask = mask.unsqueeze(1) - else: - mask = mask.unsqueeze(0) - - assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" - assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" - assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" - assert mask.shape[1] == 1, "Mask image must have a single channel" - - # Check image is in [-1, 1] - if image.min() < -1 or image.max() > 1: - raise ValueError("Image should be in [-1, 1] range") - - # Check mask is in [0, 1] - if mask.min() < 0 or mask.max() > 1: - raise ValueError("Mask should be in [0, 1] range") - - # paint-by-example inverses the mask - mask = 1 - mask - - # Binarize mask - mask = paddle.where(mask < 0.5, 0.0, 1.0) - - # Image as float32 - image = image.cast(paddle.float32) - elif isinstance(mask, paddle.Tensor): - raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") - else: - if isinstance(image, PIL.Image.Image): - image = [image] - - image = np.concatenate([np.array(i.convert("RGB"))[None, :] for i in image], axis=0) - image = image.transpose(0, 3, 1, 2) - image = paddle.to_tensor(image).cast(paddle.float32) / 127.5 - 1.0 - - # preprocess mask - if isinstance(mask, PIL.Image.Image): - mask = [mask] - - mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) - mask = mask.astype(np.float32) / 255.0 - - # paint-by-example inverses the mask - mask = 1 - mask - - mask[mask < 0.5] = 0 - mask[mask >= 0.5] = 1 - mask = paddle.to_tensor(mask) - - masked_image = image * mask - - return mask, masked_image - - -class PaintByExamplePipeline(DiffusionPipeline): - r""" - Pipeline for image-guided image inpainting using Stable Diffusion. *This is an experimental feature*. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - image_encoder ([`PaintByExampleImageEncoder`]): - Encodes the example input image. The unet is conditioned on the example image instead of a text prompt. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. 
- Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - # TODO: feature_extractor is required to encode initial images (if they are in PIL format), - # we should give a descriptive message if the pipeline doesn't have one. - _optional_components = ["safety_checker"] - - def __init__( - self, - vae: AutoencoderKL, - image_encoder: PaintByExampleImageEncoder, - unet: UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = False, - ): - super().__init__() - - self.register_modules( - vae=vae, - image_encoder=image_encoder, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
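Note: as in the other removed pipelines, prepare_extra_step_kwargs only forwards eta and generator when the scheduler's step signature accepts them. A standalone sketch of that check, with a toy scheduler standing in for the real classes:

import inspect

class ToyScheduler:  # stand-in: accepts eta but not generator
    def step(self, noise_pred, t, latents, eta=0.0):
        ...

scheduler = ToyScheduler()
step_params = set(inspect.signature(scheduler.step).parameters.keys())

extra_step_kwargs = {}
if "eta" in step_params:
    extra_step_kwargs["eta"] = 0.0
if "generator" in step_params:
    extra_step_kwargs["generator"] = None

print(extra_step_kwargs)  # {'eta': 0.0}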
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation.StableDiffusionImageVariationPipeline.check_inputs - def check_inputs(self, image, height, width, callback_steps): - if ( - not isinstance(image, paddle.Tensor) - and not isinstance(image, PIL.Image.Image) - and not isinstance(image, list) - ): - raise ValueError( - "`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" - f" {type(image)}" - ) - - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_mask_latents - def prepare_mask_latents( - self, mask, masked_image, batch_size, height, width, dtype, generator, do_classifier_free_guidance - ): - # resize the mask to latents shape as we concatenate the mask to the latents - # we do that before converting to dtype to avoid breaking in case we're using cpu_offload - # and half precision - mask = paddle.nn.functional.interpolate( - mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) - ) - mask = mask.cast(dtype) - - masked_image = masked_image.cast(dtype) - - # encode the mask image into latents space so we can concatenate it to the latents - if isinstance(generator, list): - masked_image_latents = [ - self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i]) - for i in range(batch_size) - ] - masked_image_latents = paddle.concat(masked_image_latents, axis=0) - else: - masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator) - masked_image_latents = self.vae.config.scaling_factor * masked_image_latents - - # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method - if mask.shape[0] < batch_size: - if not batch_size % mask.shape[0] == 0: - raise ValueError( - "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to" - f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number" - " of masks that you pass is divisible by the total requested batch size." - ) - mask = mask.tile([batch_size // mask.shape[0], 1, 1, 1]) - if masked_image_latents.shape[0] < batch_size: - if not batch_size % masked_image_latents.shape[0] == 0: - raise ValueError( - "The passed images and the required batch size don't match. Images are supposed to be duplicated" - f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." - " Make sure the number of images that you pass is divisible by the total requested batch size." 
- ) - masked_image_latents = masked_image_latents.tile([batch_size // masked_image_latents.shape[0], 1, 1, 1]) - - mask = paddle.concat([mask] * 2) if do_classifier_free_guidance else mask - masked_image_latents = ( - paddle.concat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents - ) - - # aligning device to prevent device errors when concating it with the latent model input - masked_image_latents = masked_image_latents.cast(dtype) - return mask, masked_image_latents - - def _encode_image(self, image, num_images_per_prompt, do_classifier_free_guidance): - dtype = self.image_encoder.dtype - - if not isinstance(image, paddle.Tensor): - image = self.feature_extractor(images=image, return_tensors="pd").pixel_values - - image = image.cast(dtype) - image_embeddings, negative_prompt_embeds = self.image_encoder(image, return_uncond_vector=True) - - # duplicate image embeddings for each generation per prompt, using mps friendly method - bs_embed, seq_len, _ = image_embeddings.shape - image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1]) - image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - if do_classifier_free_guidance: - negative_prompt_embeds = negative_prompt_embeds.tile([1, image_embeddings.shape[0], 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([bs_embed * num_images_per_prompt, 1, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings]) - - return image_embeddings - - @paddle.no_grad() - def __call__( - self, - example_image: Union[paddle.Tensor, PIL.Image.Image], - image: Union[paddle.Tensor, PIL.Image.Image], - mask_image: Union[paddle.Tensor, PIL.Image.Image], - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 5.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - example_image (`paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): - The exemplar image to guide the image generation. - image (`paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): - `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will - be masked out with `mask_image` and repainted according to `prompt`. - mask_image (`paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): - `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be - repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted - to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) - instead of 3, so the expected shape would be `(B, H, W, 1)`. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. 
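Note: prepare_mask_latents above shrinks the binary mask to the latent grid, encodes the masked image with the VAE, and tiles both to the requested batch size. A condensed sketch of the resize-and-tile step only (the VAE encode is elided; a vae_scale_factor of 8 is the usual Stable-Diffusion value):

import paddle
import paddle.nn.functional as F

vae_scale_factor = 8
batch_size = 4  # e.g. 2 input images x 2 images per prompt

mask = paddle.ones([1, 1, 512, 512])  # full-resolution binary mask
mask = F.interpolate(mask, size=(512 // vae_scale_factor, 512 // vae_scale_factor))
mask = mask.tile([batch_size // mask.shape[0], 1, 1, 1])
print(mask.shape)  # [4, 1, 64, 64]

# with classifier-free guidance the mask is duplicated once more:
# mask = paddle.concat([mask] * 2)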
- width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 1. Define call parameters - if isinstance(image, PIL.Image.Image): - batch_size = 1 - elif isinstance(image, list): - batch_size = len(image) - else: - batch_size = image.shape[0] - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 2. 
Preprocess mask and image - mask, masked_image = prepare_mask_and_masked_image(image, mask_image) - height, width = masked_image.shape[-2:] - - # 3. Check inputs - self.check_inputs(example_image, height, width, callback_steps) - - # 4. Encode input image - image_embeddings = self._encode_image(example_image, num_images_per_prompt, do_classifier_free_guidance) - - # 5. set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 6. Prepare latent variables - num_channels_latents = self.vae.config.latent_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - image_embeddings.dtype, - generator, - latents, - ) - - # 7. Prepare mask latent variables - mask, masked_image_latents = self.prepare_mask_latents( - mask, - masked_image, - batch_size * num_images_per_prompt, - height, - width, - image_embeddings.dtype, - generator, - do_classifier_free_guidance, - ) - - # 8. Check that sizes of mask, masked image and latents match - num_channels_mask = mask.shape[1] - num_channels_masked_image = masked_image_latents.shape[1] - if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: - raise ValueError( - f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" - f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" - f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" - f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" - " `pipeline.unet` or your `mask_image` or `image` input." - ) - - # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 10. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - - # concat latents, mask, masked_image_latents in the channel dimension - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - latent_model_input = paddle.concat([latent_model_input, masked_image_latents, mask], axis=1) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # must cast this, paddle.concat has bug... - latents = latents.cast(image_embeddings.dtype) - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 11. Post-processing - image = self.decode_latents(latents) - - # 12. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, image_embeddings.dtype) - - # 13. 
Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py b/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py deleted file mode 100644 index b09eeb7e779d..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py +++ /dev/null @@ -1,1687 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 The HuggingFace Inc. team. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import fnmatch -import importlib -import inspect -import os -import re -import sys -import tempfile -import warnings -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Dict, List, Optional, Union - -import numpy as np -import PIL -import PIL.Image -from huggingface_hub import ( - create_repo, - get_hf_file_metadata, - hf_hub_url, - model_info, - repo_type_and_id_from_hf_id, - snapshot_download, - upload_folder, -) -from huggingface_hub.utils import EntryNotFoundError -from packaging import version -from tqdm.auto import tqdm - -from ..configuration_utils import ConfigMixin -from ..schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME -from ..utils import ( - CONFIG_NAME, - DEPRECATED_REVISION_ARGS, - DIFFUSERS_CACHE, - FLAX_WEIGHTS_NAME, - FROM_DIFFUSERS, - FROM_HF_HUB, - HF_HUB_OFFLINE, - LOW_CPU_MEM_USAGE_DEFAULT, - ONNX_EXTERNAL_WEIGHTS_NAME, - ONNX_WEIGHTS_NAME, - PPDIFFUSERS_CACHE, - TO_DIFFUSERS, - TORCH_SAFETENSORS_WEIGHTS_NAME, - TORCH_WEIGHTS_NAME, - BaseOutput, - deprecate, - get_class_from_dynamic_module, - is_paddle_available, - is_paddlenlp_available, - is_safetensors_available, - logging, - numpy_to_pil, - ppdiffusers_bos_dir_download, - ppdiffusers_url_download, -) -from ..version import VERSION as __version__ - -if is_paddle_available(): - import paddle - import paddle.nn as nn - -if is_paddlenlp_available(): - from paddlenlp.transformers import PretrainedModel - -from .fastdeploy_utils import FastDeployRuntimeModel - -TRANSFORMERS_SAFE_WEIGHTS_NAME = "model.safetensors" -TRANSFORMERS_WEIGHTS_NAME = "pytorch_model.bin" -TRANSFORMERS_FLAX_WEIGHTS_NAME = "flax_model.msgpack" - - -TORCH_INDEX_FILE = "diffusion_pytorch_model.bin" -PADDLE_INDEX_FILE = "model_state.pdparams" - -CUSTOM_PIPELINE_FILE_NAME = "pipeline.py" -DUMMY_MODULES_FOLDER = "ppdiffusers.utils" -PADDLENLP_DUMMY_MODULES_FOLDER = "paddlenlp.transformers.utils" - -logger = logging.get_logger(__name__) - - -LOADABLE_CLASSES = { - "ppdiffusers": { - "ModelMixin": ["save_pretrained", "from_pretrained"], - "SchedulerMixin": ["save_pretrained", "from_pretrained"], - "DiffusionPipeline": ["save_pretrained", "from_pretrained"], - "FastDeployRuntimeModel": ["save_pretrained", "from_pretrained"], - }, - "paddlenlp.transformers": { - "PretrainedTokenizer": 
["save_pretrained", "from_pretrained"], - "PretrainedModel": ["save_pretrained", "from_pretrained"], - "FeatureExtractionMixin": ["save_pretrained", "from_pretrained"], - "ProcessorMixin": ["save_pretrained", "from_pretrained"], - "ImageProcessingMixin": ["save_pretrained", "from_pretrained"], - "RobertaTokenizer": ["save_pretrained", "from_pretrained"], - }, -} - -ALL_IMPORTABLE_CLASSES = {} -for library in LOADABLE_CLASSES: - ALL_IMPORTABLE_CLASSES.update(LOADABLE_CLASSES[library]) - - -@dataclass -class ImagePipelineOutput(BaseOutput): - """ - Output class for image pipelines. - - Args: - images (`List[PIL.Image.Image]` or `np.ndarray`) - List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, - num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. - """ - - images: Union[List[PIL.Image.Image], np.ndarray] - - -@dataclass -class TextPipelineOutput(BaseOutput): - """ - Output class for text pipelines. - Args: - prompt (`List[str]` or `str`) - List of denoised texts. - """ - - texts: Union[List[str], str] - - -@dataclass -class AudioPipelineOutput(BaseOutput): - """ - Output class for audio pipelines. - - Args: - audios (`np.ndarray`) - List of denoised samples of shape `(batch_size, num_channels, sample_rate)`. Numpy array present the - denoised audio samples of the diffusion pipeline. - """ - - audios: np.ndarray - - -def is_safetensors_compatible(filenames, variant=None, passed_components=None) -> bool: - """ - Checking for safetensors compatibility: - - By default, all models are saved with the default pytorch serialization, so we use the list of default pytorch - files to know which safetensors files are needed. - - The model is safetensors compatible only if there is a matching safetensors file for every default pytorch file. 
- Converting default pytorch serialized filenames to safetensors serialized filenames: - - For models from the diffusers library, just replace the ".bin" extension with ".safetensors" - - For models from the transformers library, the filename changes from "pytorch_model" to "model", and the ".bin" - extension is replaced with ".safetensors" - """ - pt_filenames = [] - - sf_filenames = set() - - passed_components = passed_components or [] - - for filename in filenames: - _, extension = os.path.splitext(filename) - - if len(filename.split("/")) == 2 and filename.split("/")[0] in passed_components: - continue - - if extension == ".bin": - pt_filenames.append(filename) - elif extension == ".safetensors": - sf_filenames.add(filename) - - for filename in pt_filenames: - # filename = 'foo/bar/baz.bam' -> path = 'foo/bar', filename = 'baz', extention = '.bam' - path, filename = os.path.split(filename) - filename, extension = os.path.splitext(filename) - - if filename.startswith("pytorch_model"): - filename = filename.replace("pytorch_model", "model") - else: - filename = filename - - expected_sf_filename = os.path.join(path, filename) - expected_sf_filename = f"{expected_sf_filename}.safetensors" - - if expected_sf_filename not in sf_filenames: - logger.warning(f"{expected_sf_filename} not found") - return False - - return True - - -def variant_compatible_siblings(filenames, variant=None) -> Union[List[os.PathLike], str]: - weight_names = [ - TORCH_WEIGHTS_NAME, - TORCH_SAFETENSORS_WEIGHTS_NAME, - TRANSFORMERS_WEIGHTS_NAME, - TRANSFORMERS_SAFE_WEIGHTS_NAME, - TRANSFORMERS_FLAX_WEIGHTS_NAME, - FLAX_WEIGHTS_NAME, - ONNX_WEIGHTS_NAME, - ONNX_EXTERNAL_WEIGHTS_NAME, - ] - # model_pytorch, diffusion_model_pytorch, ... - weight_prefixes = [w.split(".")[0] for w in weight_names] - # .bin, .safetensors, ... 
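Note: is_safetensors_compatible above derives, for every default PyTorch weight file, the safetensors filename it must find. Two worked examples of that mapping (the repository layout shown is illustrative):

import os

def expected_safetensors_name(filename: str) -> str:
    path, name = os.path.split(filename)
    name, _ = os.path.splitext(name)
    # transformers-style weights are renamed; diffusers-style keep their stem
    if name.startswith("pytorch_model"):
        name = name.replace("pytorch_model", "model")
    return os.path.join(path, f"{name}.safetensors")

print(expected_safetensors_name("text_encoder/pytorch_model.bin"))
# text_encoder/model.safetensors
print(expected_safetensors_name("unet/diffusion_pytorch_model.bin"))
# unet/diffusion_pytorch_model.safetensors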
- weight_suffixs = [w.split(".")[-1] for w in weight_names] - # -00001-of-00002 - transformers_index_format = r"\d{5}-of-\d{5}" - - if variant is not None: - # `diffusion_pytorch_model.fp16.bin` as well as `model.fp16-00001-of-00002.safetenstors` - variant_file_re = re.compile( - rf"({'|'.join(weight_prefixes)})\.({variant}|{variant}-{transformers_index_format})\.({'|'.join(weight_suffixs)})$" - ) - # `text_encoder/pytorch_model.bin.index.fp16.json` - variant_index_re = re.compile( - rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.{variant}\.json$" - ) - - # `diffusion_pytorch_model.bin` as well as `model-00001-of-00002.safetenstors` - non_variant_file_re = re.compile( - rf"({'|'.join(weight_prefixes)})(-{transformers_index_format})?\.({'|'.join(weight_suffixs)})$" - ) - # `text_encoder/pytorch_model.bin.index.json` - non_variant_index_re = re.compile(rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.json") - - if variant is not None: - variant_weights = {f for f in filenames if variant_file_re.match(f.split("/")[-1]) is not None} - variant_indexes = {f for f in filenames if variant_index_re.match(f.split("/")[-1]) is not None} - variant_filenames = variant_weights | variant_indexes - else: - variant_filenames = set() - - non_variant_weights = {f for f in filenames if non_variant_file_re.match(f.split("/")[-1]) is not None} - non_variant_indexes = {f for f in filenames if non_variant_index_re.match(f.split("/")[-1]) is not None} - non_variant_filenames = non_variant_weights | non_variant_indexes - - # all variant filenames will be used by default - usable_filenames = set(variant_filenames) - - def convert_to_variant(filename): - if "index" in filename: - variant_filename = filename.replace("index", f"index.{variant}") - elif re.compile(f"^(.*?){transformers_index_format}").match(filename) is not None: - variant_filename = f"{filename.split('-')[0]}.{variant}-{'-'.join(filename.split('-')[1:])}" - else: - variant_filename = f"{filename.split('.')[0]}.{variant}.{filename.split('.')[1]}" - return variant_filename - - for f in non_variant_filenames: - variant_filename = convert_to_variant(f) - if variant_filename not in usable_filenames: - usable_filenames.add(f) - - return usable_filenames, variant_filenames - - -def warn_deprecated_model_variant(pretrained_model_name_or_path, use_auth_token, variant, revision, model_filenames): - info = model_info( - pretrained_model_name_or_path, - use_auth_token=use_auth_token, - revision=None, - ) - filenames = {sibling.rfilename for sibling in info.siblings} - comp_model_filenames, _ = variant_compatible_siblings(filenames, variant=revision) - comp_model_filenames = [".".join(f.split(".")[:1] + f.split(".")[2:]) for f in comp_model_filenames] - - if set(comp_model_filenames) == set(model_filenames): - warnings.warn( - f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'` even though you can load it via `variant=`{revision}`. Loading model variants via `revision='{revision}'` is deprecated and will be removed in diffusers v1. Please use `variant='{revision}'` instead.", - FutureWarning, - ) - else: - warnings.warn( - f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'`. This behavior is deprecated and will be removed in diffusers v1. One should use `variant='{revision}'` instead. 
However, it appears that {pretrained_model_name_or_path} currently does not have the required variant filenames in the 'main' branch. \n The Diffusers team and community would be very grateful if you could open an issue: https://github.com/huggingface/diffusers/issues/new with the title '{pretrained_model_name_or_path} is missing {revision} files' so that the correct variant file can be added.", - FutureWarning, - ) - - -def maybe_raise_or_warn( - library_name, library, class_name, importable_classes, passed_class_obj, name, is_pipeline_module -): - """Simple helper method to raise or warn in case incorrect module has been passed""" - if not is_pipeline_module: - library = importlib.import_module(library_name) - class_obj = getattr(library, class_name) - class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()} - - expected_class_obj = None - for class_name, class_candidate in class_candidates.items(): - if class_candidate is not None and issubclass(class_obj, class_candidate): - expected_class_obj = class_candidate - - # Dynamo wraps the original model in a private class. - # I didn't find a public API to get the original class. - sub_model = passed_class_obj[name] - model_cls = sub_model.__class__ - - if not issubclass(model_cls, expected_class_obj): - raise ValueError( - f"{passed_class_obj[name]} is of type: {type(passed_class_obj[name])}, but should be" - f" {expected_class_obj}" - ) - else: - logger.warning( - f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it" - " has the correct type" - ) - - -def get_class_obj_and_candidates(library_name, class_name, importable_classes, pipelines, is_pipeline_module): - """Simple helper method to retrieve class object of module as well as potential parent class objects""" - if is_pipeline_module: - pipeline_module = getattr(pipelines, library_name) - - class_obj = getattr(pipeline_module, class_name) - class_candidates = {c: class_obj for c in importable_classes.keys()} - else: - # else we just import it from the library. 
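Note: the convert_to_variant helper inside variant_compatible_siblings above rewrites default weight names into their variant counterparts. A reduced restatement with example outputs for an fp16 variant (the filenames are illustrative):

import re

TRANSFORMERS_INDEX_FORMAT = r"\d{5}-of-\d{5}"

def convert_to_variant(filename: str, variant: str = "fp16") -> str:
    if "index" in filename:                                        # index files
        return filename.replace("index", f"index.{variant}")
    if re.match(rf"^(.*?){TRANSFORMERS_INDEX_FORMAT}", filename):  # sharded files
        stem, rest = filename.split("-", 1)
        return f"{stem}.{variant}-{rest}"
    stem, ext = filename.split(".", 1)                             # plain files
    return f"{stem}.{variant}.{ext}"

print(convert_to_variant("diffusion_pytorch_model.bin"))       # diffusion_pytorch_model.fp16.bin
print(convert_to_variant("model-00001-of-00002.safetensors"))  # model.fp16-00001-of-00002.safetensors
print(convert_to_variant("pytorch_model.bin.index.json"))      # pytorch_model.bin.index.fp16.json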
- library = importlib.import_module(library_name) - class_obj = getattr(library, class_name) - class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()} - - return class_obj, class_candidates - - -def _get_pipeline_class(class_obj, config, custom_pipeline=None, cache_dir=None, revision=None): - if custom_pipeline is not None: - if custom_pipeline.endswith(".py"): - path = Path(custom_pipeline) - # decompose into folder & file - file_name = path.name - custom_pipeline = path.parent.absolute() - else: - file_name = CUSTOM_PIPELINE_FILE_NAME - - return get_class_from_dynamic_module( - custom_pipeline, module_file=file_name, cache_dir=cache_dir, revision=revision - ) - - if class_obj != DiffusionPipeline: - return class_obj - - ppdiffusers_module = importlib.import_module(class_obj.__module__.split(".")[0]) - return getattr(ppdiffusers_module, config["_class_name"]) - - -def load_sub_model( - library_name: str, - class_name: str, - importable_classes: List[Any], - pipelines: Any, - is_pipeline_module: bool, - pipeline_class: Any, - paddle_dtype: paddle.dtype, - runtime_options: Any, - model_variants: Dict[str, str], - name: str, - from_diffusers: bool, - low_cpu_mem_usage: bool = False, - cached_folder: Union[str, os.PathLike] = None, - **kwargs, -): - # support huggingface diffusers onnx model - is_onnx_model = False - if "Onnx" in class_name: - class_name = class_name.replace("Onnx", "FastDeploy") - is_onnx_model = True - - """Helper method to load the module `name` from `library_name` and `class_name`""" - # retrieve class candidates - class_obj, class_candidates = get_class_obj_and_candidates( - library_name, class_name, importable_classes, pipelines, is_pipeline_module - ) - - load_method_name = None - # retrive load method name - for class_name, class_candidate in class_candidates.items(): - if class_candidate is not None and issubclass(class_obj, class_candidate): - load_method_name = importable_classes[class_name][1] - - # if load method name is None, then we have a dummy module -> raise Error - if load_method_name is None: - none_module = class_obj.__module__ - is_dummy_path = none_module.startswith(DUMMY_MODULES_FOLDER) or none_module.startswith( - PADDLENLP_DUMMY_MODULES_FOLDER - ) - if is_dummy_path and "dummy" in none_module: - # call class_obj for nice error message of missing requirements - class_obj() - - raise ValueError( - f"The component {class_obj} of {pipeline_class} cannot be loaded as it does not seem to have" - f" any of the loading methods defined in {ALL_IMPORTABLE_CLASSES}." 
- ) - - load_method = getattr(class_obj, load_method_name) - - # add kwargs to loading method - loading_kwargs = {} - - # FastDeploy Model - if issubclass(class_obj, FastDeployRuntimeModel): - loading_kwargs["runtime_options"] = ( - runtime_options.get(name, None) if isinstance(runtime_options, dict) else runtime_options - ) - if not is_onnx_model: - if os.path.isdir(os.path.join(cached_folder, name)): - is_onnx_model = any( - d.endswith(".onnx") or d.endswith(".pb") for d in os.listdir(os.path.join(cached_folder, name)) - ) - else: - is_onnx_model = any( - d.endswith(".onnx") or d.endswith(".pb") for d in os.listdir(os.path.join(cached_folder)) - ) - loading_kwargs["is_onnx_model"] = is_onnx_model - - from ppdiffusers import ModelMixin - - # PaddleNLP or PPDiffusers Model - if issubclass(class_obj, (PretrainedModel, ModelMixin)): - loading_kwargs["variant"] = model_variants.pop(name, None) - loading_kwargs["from_diffusers"] = from_diffusers - loading_kwargs["paddle_dtype"] = paddle_dtype - loading_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage - - loaded_sub_model = None - try: - # check if the module is in a subdirectory - if os.path.isdir(os.path.join(cached_folder, name)): - loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs) - else: - # else load from the root directory - loaded_sub_model = load_method(cached_folder, **loading_kwargs) - except Exception as e: - # (TODO, junnyu) - # if we cant find this file, we will try to download this - local_files_only = kwargs["local_files_only"] - is_local_dir = kwargs["is_local_dir"] - from_hf_hub = kwargs["from_hf_hub"] - pretrained_model_name_or_path = kwargs["pretrained_model_name_or_path"] - cache_dir = kwargs["cache_dir"] - if not local_files_only and not is_local_dir and not from_hf_hub: - loaded_sub_model = load_method( - pretrained_model_name_or_path + "/" + name, cache_dir=cache_dir, **loading_kwargs - ) - if loaded_sub_model is None: - raise ValueError(f"We cant load '{name}' from {pretrained_model_name_or_path} or {cached_folder}! \n {e} ") - - return loaded_sub_model - - -class DiffusionPipeline(ConfigMixin): - r""" - Base class for all models. - - [`DiffusionPipeline`] takes care of storing all components (models, schedulers, processors) for diffusion pipelines - and handles methods for loading, downloading and saving models as well as a few methods common to all pipelines to: - - - move all PyTorch modules to the device of your choice - - enabling/disabling the progress bar for the denoising iteration - - Class attributes: - - - **config_name** (`str`) -- name of the config file that will store the class and module names of all - components of the diffusion pipeline. - - **_optional_components** (List[`str`]) -- list of all components that are optional so they don't have to be - passed for the pipeline to function (should be overridden by subclasses). 
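Note: load_sub_model above resolves which from_pretrained to call by walking LOADABLE_CLASSES and testing issubclass against each candidate base class. A reduced sketch of that lookup, with toy classes standing in for the real ppdiffusers/paddlenlp bases:

class SchedulerMixin: ...          # toy stand-ins for the real base classes
class ModelMixin: ...
class MyScheduler(SchedulerMixin): ...

LOADABLE = {
    "ModelMixin": (ModelMixin, ["save_pretrained", "from_pretrained"]),
    "SchedulerMixin": (SchedulerMixin, ["save_pretrained", "from_pretrained"]),
}

def resolve_load_method_name(cls):
    for _name, (candidate, save_load_methods) in LOADABLE.items():
        if candidate is not None and issubclass(cls, candidate):
            return save_load_methods[1]  # index 1 is the "load" method name
    return None

print(resolve_load_method_name(MyScheduler))  # from_pretrained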
- """ - config_name = "model_index.json" - _optional_components = [] - - def register_modules(self, **kwargs): - # import it here to avoid circular import - from ppdiffusers import pipelines - - for name, module in kwargs.items(): - # retrieve library - if module is None: - register_dict = {name: (None, None)} - else: - # TODO (junnyu) support paddlenlp.transformers - if "paddlenlp" in module.__module__.split(".") or "ppnlp_patch_utils" in module.__module__.split("."): - library = "paddlenlp.transformers" - else: - library = module.__module__.split(".")[0] - - # check if the module is a pipeline module - pipeline_dir = module.__module__.split(".")[-2] if len(module.__module__.split(".")) > 2 else None - path = module.__module__.split(".") - is_pipeline_module = pipeline_dir in path and hasattr(pipelines, pipeline_dir) - - # if library is not in LOADABLE_CLASSES, then it is a custom module. - # Or if it's a pipeline module, then the module is inside the pipeline - # folder so we set the library to module name. - if library not in LOADABLE_CLASSES or is_pipeline_module: - library = pipeline_dir - - # retrieve class_name - class_name = module.__class__.__name__ - - register_dict = {name: (library, class_name)} - - # save model index config - self.register_to_config(**register_dict) - - # set models - setattr(self, name, module) - - # TODO junnyu, before register model, we may need to keep some module in fp32 - if ( - isinstance(module, nn.Layer) - and hasattr(module, "_keep_in_fp32_modules") - and module.dtype == paddle.float16 - ): - for module_name, sub_module in module.named_sublayers(include_self=True): - if any(n in module_name for n in module._keep_in_fp32_modules): - sub_module.to(dtype=paddle.float32) - if hasattr(sub_module, "pre_hook"): - sub_module.pre_hook.remove() - sub_module.pre_hook = sub_module.register_forward_pre_hook( - lambda layer, input: input[0].cast("float32") - ) - - def __setattr__(self, name: str, value: Any): - if name in self.__dict__ and hasattr(self.config, name): - # We need to overwrite the config if name exists in config - if isinstance(getattr(self.config, name), (tuple, list)): - if value is not None and self.config[name][0] is not None: - class_library_tuple = (value.__module__.split(".")[0], value.__class__.__name__) - else: - class_library_tuple = (None, None) - - self.register_to_config(**{name: class_library_tuple}) - else: - self.register_to_config(**{name: value}) - - super().__setattr__(name, value) - - def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - safe_serialization: bool = False, - variant: Optional[str] = None, - to_diffusers: bool = None, - ): - """ - Save all variables of the pipeline that can be saved and loaded as well as the pipelines configuration file to - a directory. A pipeline variable can be saved and loaded if its class implements both a save and loading - method. The pipeline can easily be re-loaded using the `[`~DiffusionPipeline.from_pretrained`]` class method. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to which to save. Will be created if it doesn't exist. - safe_serialization (`bool`, *optional*, defaults to `False`): - Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). - variant (`str`, *optional*): - If specified, weights are saved in the format pytorch_model..bin. 
- """ - if to_diffusers is None: - to_diffusers = TO_DIFFUSERS - - model_index_dict = dict(self.config) - model_index_dict.pop("_class_name", None) - # TODO (junnyu) support old version - model_index_dict.pop("_diffusers_paddle_version", None) - model_index_dict.pop("_diffusers_version", None) - model_index_dict.pop("_ppdiffusers_version", None) - model_index_dict.pop("_module", None) - - expected_modules, optional_kwargs = self._get_signature_keys(self) - - def is_saveable_module(name, value): - if name not in expected_modules: - return False - if name in self._optional_components and value[0] is None: - return False - return True - - model_index_dict = {k: v for k, v in model_index_dict.items() if is_saveable_module(k, v)} - - for pipeline_component_name in model_index_dict.keys(): - sub_model = getattr(self, pipeline_component_name) - model_cls = sub_model.__class__ - - save_method_name = None - # search for the model's base class in LOADABLE_CLASSES - for library_name, library_classes in LOADABLE_CLASSES.items(): - if library_name in sys.modules: - library = importlib.import_module(library_name) - else: - logger.info( - f"{library_name} is not installed. Cannot save {pipeline_component_name} as {library_classes} from {library_name}" - ) - for base_class, save_load_methods in library_classes.items(): - class_candidate = getattr(library, base_class, None) - if class_candidate is not None and issubclass(model_cls, class_candidate): - # if we found a suitable base class in LOADABLE_CLASSES then grab its save method - save_method_name = save_load_methods[0] - break - if save_method_name is not None: - break - - if save_method_name is None: - logger.warn(f"self.{pipeline_component_name}={sub_model} of type {type(sub_model)} cannot be saved.") - # make sure that unsaveable components are not tried to be loaded afterward - self.register_to_config(**{pipeline_component_name: (None, None)}) - continue - - save_method = getattr(sub_model, save_method_name) - - # Call the save method with the argument safe_serialization only if it's supported - save_method_signature = inspect.signature(save_method) - save_method_accept_safe = "safe_serialization" in save_method_signature.parameters - save_method_accept_variant = "variant" in save_method_signature.parameters - save_method_accept_to_diffusers = "to_diffusers" in save_method_signature.parameters - - save_kwargs = {} - # maybe we donot have torch so we use safe_serialization - if to_diffusers: - safe_serialization = True - - if save_method_accept_safe: - save_kwargs["safe_serialization"] = safe_serialization - if save_method_accept_variant: - save_kwargs["variant"] = variant - if save_method_accept_to_diffusers: - save_kwargs["to_diffusers"] = to_diffusers - - save_method(os.path.join(save_directory, pipeline_component_name), **save_kwargs) - - # finally save the config - self.save_config(save_directory, to_diffusers=to_diffusers) - - def save_to_hf_hub( - self, - repo_id: str, - private: Optional[bool] = None, - commit_message: Optional[str] = None, - revision: Optional[str] = None, - create_pr: bool = False, - ): - """ - Uploads all elements of this pipeline to a new HuggingFace Hub repository. - Args: - repo_id (str): Repository name for your model/tokenizer in the Hub. - private (bool, optional): Whether the model/tokenizer is set to private - commit_message (str, optional) — The summary / title / first line of the generated commit. 
Defaults to: f"Upload {path_in_repo} with huggingface_hub" - revision (str, optional) — The git revision to commit from. Defaults to the head of the "main" branch. - create_pr (boolean, optional) — Whether or not to create a Pull Request with that commit. Defaults to False. - If revision is not set, PR is opened against the "main" branch. If revision is set and is a branch, PR is opened against this branch. - If revision is set and is not a branch name (example: a commit oid), an RevisionNotFoundError is returned by the server. - - Returns: The url of the commit of your model in the given repository. - """ - repo_url = create_repo(repo_id, private=private, exist_ok=True) - - # Infer complete repo_id from repo_url - # Can be different from the input `repo_id` if repo_owner was implicit - _, repo_owner, repo_name = repo_type_and_id_from_hf_id(repo_url) - - repo_id = f"{repo_owner}/{repo_name}" - - # Check if README file already exist in repo - try: - get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision)) - has_readme = True - except EntryNotFoundError: - has_readme = False - - with tempfile.TemporaryDirectory() as tmp_dir: - # save model - self.save_pretrained(tmp_dir) - # Add readme if does not exist - logger.info("README.md not found, adding the default README.md") - if not has_readme: - with open(os.path.join(tmp_dir, "README.md"), "w") as f: - f.write(f"---\nlibrary_name: ppdiffusers\n---\n# {repo_id}") - - # Upload model and return - logger.info(f"Pushing to the {repo_id}. This might take a while") - return upload_folder( - repo_id=repo_id, - repo_type="model", - folder_path=tmp_dir, - commit_message=commit_message, - revision=revision, - create_pr=create_pr, - ) - - def to( - self, - paddle_device: Optional[str] = None, - paddle_dtype: Optional[paddle.dtype] = None, - silence_dtype_warnings: bool = True, - ): - if paddle_device is None and paddle_dtype is None: - return self - - module_names, _ = self._get_signature_keys(self) - modules = [getattr(self, n, None) for n in module_names] - modules = [m for m in modules if isinstance(m, nn.Layer)] - for module in modules: - if ( - paddle_device is not None - and module.dtype == paddle.float16 - and str(paddle_device) in ["cpu"] - and not silence_dtype_warnings - ): - logger.warning( - "Pipelines loaded with `paddle_dtype=paddle.float16` cannot run with `cpu` device. It" - " is not recommended to move them to `cpu` as running them will fail. Please make" - " sure to use an accelerator to run the pipeline in inference, due to the lack of" - " support for`float16` operations on this device in Paddle. Please, remove the" - " `paddle_dtype=paddle.float16` argument, or use another device for inference." 
- ) - kwargs = {} - if paddle_device is not None: - kwargs["device"] = paddle_device - if paddle_dtype is not None: - kwargs["dtype"] = paddle_dtype - module.to(**kwargs) - - # TODO junnyu, before register model, we may need to keep some module in fp32 - if ( - isinstance(module, nn.Layer) - and hasattr(module, "_keep_in_fp32_modules") - and module.dtype == paddle.float16 - ): - for module_name, sub_module in module.named_sublayers(include_self=True): - if any(n in module_name for n in module._keep_in_fp32_modules): - sub_module.to(dtype=paddle.float32) - if hasattr(sub_module, "pre_hook"): - sub_module.pre_hook.remove() - sub_module.pre_hook = sub_module.register_forward_pre_hook( - lambda layer, input: input[0].cast("float32") - ) - return self - - @property - def device(self): - r""" - Returns: - `paddle.device`: The paddle device on which the pipeline is located. - """ - module_names, _ = self._get_signature_keys(self) - modules = [getattr(self, n, None) for n in module_names] - modules = [m for m in modules if isinstance(m, nn.Layer)] - - for module in modules: - return module.place - return "cpu" - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): - r""" - Instantiate a Paddle diffusion pipeline from pre-trained pipeline weights. - - The pipeline is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). - - The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come - pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning - task. - - The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those - weights are discarded. - - Parameters: - pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): - Can be either: - - - A string, the *repo id* of a pretrained pipeline hosted inside a model repo on - https://huggingface.co/ Valid repo ids have to be located under a user or organization name, like - `CompVis/ldm-text2im-large-256`. - - A path to a *directory* containing pipeline weights saved using - [`~DiffusionPipeline.save_pretrained`], e.g., `./my_pipeline_directory/`. - paddle_dtype (`str` or `paddle.dtype`, *optional*): - Override the default `paddle.dtype` and load the model under this dtype. If `"auto"` is passed the dtype - will be automatically derived from the model's weights. - custom_pipeline (`str`, *optional*): - - - - This is an experimental feature and is likely to change in the future. - - - - Can be either: - - - A string, the *repo id* of a custom pipeline hosted inside a model repo on - https://huggingface.co/. Valid repo ids have to be located under a user or organization name, - like `hf-internal-testing/diffusers-dummy-pipeline`. - - - - It is required that the model repo has a file, called `pipeline.py` that defines the custom - pipeline. - - - - - A string, the *file name* of a community pipeline hosted on GitHub under - https://github.com/huggingface/diffusers/tree/main/examples/community. Valid file names have to - match exactly the file name without `.py` located under the above link, *e.g.* - `clip_guided_stable_diffusion`. - - - - Community pipelines are always loaded from the current `main` branch of GitHub. - - - - - A path to a *directory* containing a custom pipeline, e.g., `./my_pipeline_directory/`. 
- - - - It is required that the directory has a file, called `pipeline.py` that defines the custom - pipeline. - - - - For more information on how to load and create custom pipelines, please have a look at [Loading and - Adding Custom - Pipelines](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview) - - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - output_loading_info(`bool`, *optional*, defaults to `False`): - Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. - local_files_only(`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). - use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `huggingface-cli login` (stored in `~/.huggingface`). - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. - custom_revision (`str`, *optional*, defaults to `"main"` when loading from the Hub and to local version of `diffusers` when loading from GitHub): - The specific model version to use. It can be a branch name, a tag name, or a commit id similar to - `revision` when loading a custom pipeline from the Hub. It can be a diffusers version when loading a - custom pipeline from GitHub. - mirror (`str`, *optional*): - Mirror source to accelerate downloads in China. If you are from China and have an accessibility - problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. - Please refer to the mirror site for more information. specify the folder name here. - return_cached_folder (`bool`, *optional*, defaults to `False`): - If set to `True`, path to downloaded cached folder will be returned in addition to loaded pipeline. - use_safetensors (`bool`, *optional* ): - If set to `True`, the pipeline will be loaded from `safetensors` weights. If set to `None` (the - default). The pipeline will load using `safetensors` if the safetensors weights are available *and* if - `safetensors` is installed. If the to `False` the pipeline will *not* use `safetensors`. - kwargs (remaining dictionary of keyword arguments, *optional*): - Can be used to overwrite load - and saveable variables - *i.e.* the pipeline components - of the - specific pipeline class. The overwritten components are then directly passed to the pipelines - `__init__` method. See example below for more information. 
- variant (`str`, *optional*): - If specified load weights from `variant` filename, *e.g.* pytorch_model..bin. `variant` is - ignored when using `from_flax`. - - - - It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated - models](https://huggingface.co/docs/hub/models-gated#gated-models), *e.g.* `"runwayml/stable-diffusion-v1-5"` - - - - - - Activate the special ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use - this method in a firewalled environment. - - - - Examples: - - ```py - >>> from ppdiffusers import DiffusionPipeline - - >>> # Download pipeline from huggingface.co and cache. - >>> pipeline = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256") - - >>> # Download pipeline that requires an authorization token - >>> # For more information on access tokens, please refer to this section - >>> # of the documentation](https://huggingface.co/docs/hub/security-tokens) - >>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - - >>> # Use a different scheduler - >>> from ppdiffusers import LMSDiscreteScheduler - - >>> scheduler = LMSDiscreteScheduler.from_config(pipeline.scheduler.config) - >>> pipeline.scheduler = scheduler - ``` - """ - resume_download = kwargs.pop("resume_download", False) - force_download = kwargs.pop("force_download", False) - proxies = kwargs.pop("proxies", None) - local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) - use_auth_token = kwargs.pop("use_auth_token", None) - revision = kwargs.pop("revision", None) - from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS) - paddle_dtype = kwargs.pop("paddle_dtype", None) - custom_pipeline = kwargs.pop("custom_pipeline", None) - custom_revision = kwargs.pop("custom_revision", None) - runtime_options = kwargs.pop("runtime_options", None) - low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", LOW_CPU_MEM_USAGE_DEFAULT) - use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False) - variant = kwargs.pop("variant", None) - from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - - # deperate - return_cached_folder = kwargs.pop("return_cached_folder", False) - - cache_dir = ( - kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) - ) - - load_sub_model_kwargs = { - "pretrained_model_name_or_path": pretrained_model_name_or_path, - "is_local_dir": False, - "local_files_only": local_files_only, - "from_hf_hub": from_hf_hub, - "cache_dir": cache_dir, - } - - # 1. Download the checkpoints and configs - # use snapshot download here to get it working from from_pretrained - if not os.path.isdir(pretrained_model_name_or_path): - cached_folder = cls.download( - pretrained_model_name_or_path, - cache_dir=cache_dir, - resume_download=resume_download, - force_download=force_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - use_safetensors=use_safetensors, - custom_pipeline=custom_pipeline, - custom_revision=custom_revision, - variant=variant, - from_hf_hub=from_hf_hub, - from_diffusers=from_diffusers, - **kwargs, - ) - else: - # is_local_dir - load_sub_model_kwargs["is_local_dir"] = True - cached_folder = pretrained_model_name_or_path - - config_dict = cls.load_config(cached_folder) - - # pop out "_ignore_files" as it is only needed for download - config_dict.pop("_ignore_files", None) - - # 2. 
Define which model components should load variants - # We retrieve the information by matching whether variant - # model checkpoints exist in the subfolders - model_variants = {} - if variant is not None: - for folder in os.listdir(cached_folder): - folder_path = os.path.join(cached_folder, folder) - is_folder = os.path.isdir(folder_path) and folder in config_dict - variant_exists = is_folder and any( - p.split(".")[1].startswith(variant) for p in os.listdir(folder_path) - ) - if variant_exists: - model_variants[folder] = variant - - # 3. Load the pipeline class, if using custom module then load it from the hub - # if we load from explicit class, let's use it - pipeline_class = _get_pipeline_class( - cls, config_dict, custom_pipeline=custom_pipeline, cache_dir=cache_dir, revision=custom_revision - ) - - # DEPRECATED: To be removed in 1.0.0 - _ppdiffusers_version = ( - config_dict["_diffusers_paddle_version"] - if "_diffusers_paddle_version" in config_dict - else config_dict["_ppdiffusers_version"] - ) - if pipeline_class.__name__ == "StableDiffusionInpaintPipeline" and version.parse( - version.parse(_ppdiffusers_version).base_version - ) <= version.parse("0.5.1"): - from ppdiffusers import ( - StableDiffusionInpaintPipeline, - StableDiffusionInpaintPipelineLegacy, - ) - - pipeline_class = StableDiffusionInpaintPipelineLegacy - - deprecation_message = ( - "You are using a legacy checkpoint for inpainting with Stable Diffusion, therefore we are loading the" - f" {StableDiffusionInpaintPipelineLegacy} class instead of {StableDiffusionInpaintPipeline}. For" - " better inpainting results, we strongly suggest using Stable Diffusion's official inpainting" - " checkpoint: https://huggingface.co/runwayml/stable-diffusion-inpainting instead or adapting your" - f" checkpoint {pretrained_model_name_or_path} to the format of" - " https://huggingface.co/runwayml/stable-diffusion-inpainting. Note that we do not actively maintain" - f" the {StableDiffusionInpaintPipelineLegacy} class and will likely remove it in version 1.0.0." - ) - deprecate("StableDiffusionInpaintPipelineLegacy", "1.0.0", deprecation_message, standard_warn=False) - - # 4. Define expected modules given pipeline signature - # and define non-None initialized modules (=`init_kwargs`) - - # some modules can be passed directly to the init - # in this case they are already instantiated in `kwargs` - # extract them here - expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class) - passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs} - passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs} - - init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(config_dict, **kwargs) - - # define init kwargs - init_kwargs = {k: init_dict.pop(k) for k in optional_kwargs if k in init_dict} - init_kwargs = {**init_kwargs, **passed_pipe_kwargs} - - # remove `null` components - def load_module(name, value): - if value[0] is None: - return False - if name in passed_class_obj and passed_class_obj[name] is None: - return False - return True - - init_dict = {k: v for k, v in init_dict.items() if load_module(k, v)} - - # 5. Throw nice warnings / errors for fast accelerate loading - if len(unused_kwargs) > 0: - logger.warning( - f"Keyword arguments {unused_kwargs} are not expected by {pipeline_class.__name__} and will be ignored." - ) - # import it here to avoid circular import - from ppdiffusers import pipelines - - # 6. 
Load each module in the pipeline - for name, (library_name, class_name) in init_dict.items(): - # 6.0 - support old model_index.json and hf model_index.json - if library_name in ["diffusers_paddle", "diffusers"]: - library_name = "ppdiffusers" - if library_name == "transformers": - library_name = "paddlenlp.transformers" - - # 6.1 - now that JAX/Flax is an official framework of the library, we might load from Flax names - if class_name.startswith("Flax"): - class_name = class_name[4:] - - if class_name.endswith("TokenizerFast"): - class_name = class_name[:-4] - - # 6.2 Define all importable classes - is_pipeline_module = hasattr(pipelines, library_name) - importable_classes = ALL_IMPORTABLE_CLASSES if is_pipeline_module else LOADABLE_CLASSES[library_name] - loaded_sub_model = None - - # 6.3 Use passed sub model or load class_name from library_name - if name in passed_class_obj: - # 1. check that passed_class_obj has correct parent class - if not is_pipeline_module: - # if the model is in a pipeline module, then we load it from the pipeline - # check that passed_class_obj has correct parent class - maybe_raise_or_warn( - library_name, - library, - class_name, - importable_classes, - passed_class_obj, - name, - is_pipeline_module, - ) - - loaded_sub_model = passed_class_obj[name] - else: - # load sub model - loaded_sub_model = load_sub_model( - library_name=library_name, - class_name=class_name, - importable_classes=importable_classes, - pipelines=pipelines, - is_pipeline_module=is_pipeline_module, - pipeline_class=pipeline_class, - paddle_dtype=paddle_dtype, - runtime_options=runtime_options, - model_variants=model_variants, - name=name, - from_diffusers=from_diffusers, - variant=variant, - low_cpu_mem_usage=low_cpu_mem_usage, - cached_folder=cached_folder, - **load_sub_model_kwargs, - ) - - init_kwargs[name] = loaded_sub_model # UNet(...), # DiffusionSchedule(...) - - # 7. Potentially add passed objects if expected - missing_modules = set(expected_modules) - set(init_kwargs.keys()) - passed_modules = list(passed_class_obj.keys()) - optional_modules = pipeline_class._optional_components - if len(missing_modules) > 0 and missing_modules <= set(passed_modules + optional_modules): - for module in missing_modules: - init_kwargs[module] = passed_class_obj.get(module, None) - elif len(missing_modules) > 0: - passed_modules = set(list(init_kwargs.keys()) + list(passed_class_obj.keys())) - optional_kwargs - raise ValueError( - f"Pipeline {pipeline_class} expected {expected_modules}, but only {passed_modules} were passed." - ) - - # 8. (TODO, junnyu) make sure all modules are in eval mode and cast dtype - for name, _module in init_kwargs.items(): - if isinstance(_module, nn.Layer): - _module.eval() - if paddle_dtype is not None and _module.dtype != paddle_dtype: - _module.to(dtype=paddle_dtype) - elif isinstance(_module, (tuple, list)): - for _submodule in _module: - if isinstance(_submodule, nn.Layer): - _submodule.eval() - if paddle_dtype is not None and _submodule.dtype != paddle_dtype: - _submodule.to(dtype=paddle_dtype) - - # 9. Instantiate the pipeline - model = pipeline_class(**init_kwargs) - - if return_cached_folder: - message = f"Passing `return_cached_folder=True` is deprecated and will be removed in `diffusers=0.17.0`. Please do the following instead: \n 1. Load the cached_folder via `cached_folder={cls}.download({pretrained_model_name_or_path})`. \n 2. Load the pipeline by loading from the cached folder: `pipeline={cls}.from_pretrained(cached_folder)`." 
- deprecate("return_cached_folder", "0.17.0", message) - return model, cached_folder - - return model - - @classmethod - def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: - r""" - Download and cache a PyTorch diffusion pipeline from pre-trained pipeline weights. - Parameters: - pretrained_model_name (`str` or `os.PathLike`, *optional*): - Should be a string, the *repo id* of a pretrained pipeline hosted inside a model repo on - https://huggingface.co/ Valid repo ids have to be located under a user or organization name, like - `CompVis/ldm-text2im-large-256`. - custom_pipeline (`str`, *optional*): - - This is an experimental feature and is likely to change in the future. - - Can be either: - - A string, the *repo id* of a custom pipeline hosted inside a model repo on - https://huggingface.co/. Valid repo ids have to be located under a user or organization name, - like `hf-internal-testing/diffusers-dummy-pipeline`. - - It is required that the model repo has a file, called `pipeline.py` that defines the custom - pipeline. - - - A string, the *file name* of a community pipeline hosted on GitHub under - https://github.com/huggingface/diffusers/tree/main/examples/community. Valid file names have to - match exactly the file name without `.py` located under the above link, *e.g.* - `clip_guided_stable_diffusion`. - - Community pipelines are always loaded from the current `main` branch of GitHub. - - - A path to a *directory* containing a custom pipeline, e.g., `./my_pipeline_directory/`. - - It is required that the directory has a file, called `pipeline.py` that defines the custom - pipeline. - - For more information on how to load and create custom pipelines, please have a look at [Loading and - Adding Custom - Pipelines](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview) - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - output_loading_info(`bool`, *optional*, defaults to `False`): - Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. - local_files_only(`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). - use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `huggingface-cli login` (stored in `~/.huggingface`). - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. - custom_revision (`str`, *optional*, defaults to `"main"` when loading from the Hub and to local version of - `diffusers` when loading from GitHub): - The specific model version to use. 
It can be a branch name, a tag name, or a commit id similar to - `revision` when loading a custom pipeline from the Hub. It can be a diffusers version when loading a - custom pipeline from GitHub. - mirror (`str`, *optional*): - Mirror source to accelerate downloads in China. If you are from China and have an accessibility - problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. - Please refer to the mirror site for more information. specify the folder name here. - variant (`str`, *optional*): - If specified load weights from `variant` filename, *e.g.* pytorch_model..bin or - model_state..pdparams. - - It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated - models](https://huggingface.co/docs/hub/models-gated#gated-models) - - """ - from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = ( - kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) - ) - from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS) - resume_download = kwargs.pop("resume_download", False) - force_download = kwargs.pop("force_download", False) - proxies = kwargs.pop("proxies", None) - local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) - use_auth_token = kwargs.pop("use_auth_token", None) - revision = kwargs.pop("revision", None) - custom_pipeline = kwargs.pop("custom_pipeline", None) - custom_revision = kwargs.pop("custom_revision", None) - variant = kwargs.pop("variant", None) - use_safetensors = kwargs.pop("use_safetensors", None) - max_workers = int(kwargs.pop("max_workers", 1)) - - if from_diffusers and use_safetensors and not is_safetensors_available(): - raise ValueError( - "`use_safetensors`=True but safetensors is not installed. 
Please install safetensors with `pip install safetenstors" - ) - allow_pickle = False - if use_safetensors is None: - use_safetensors = is_safetensors_available() - allow_pickle = True - - pipeline_is_cached = False - allow_patterns = [] - ignore_patterns = [] - - # load config - config_dict, config_file = cls.load_config( - pretrained_model_name, - cache_dir=cache_dir, - resume_download=resume_download, - force_download=force_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - from_hf_hub=from_hf_hub, - return_config_file=True, - ) - - ignore_filenames = config_dict.pop("_ignore_files", []) - - # is_fastdeploy_model we wont use safetensors - if cls == DiffusionPipeline: - is_fastdeploy_model = "fastdeploy" in config_dict.get("_class_name", "").lower() - else: - is_fastdeploy_model = "fastdeploy" in cls.__name__.lower() - if is_fastdeploy_model: - use_safetensors = False - - # retrieve all folder_names that contain relevant files - folder_names = [] - for k, v in config_dict.items(): - # if we pass specifc module, we won't donwload this - if k in kwargs: - continue - if isinstance(v, list): - if None in v: - continue - folder_names.append(k) - - # support [PT] .bin, .safetensors, [PD] .pdparams, fastdeploy model - if from_hf_hub: - if not local_files_only: - info = model_info( - pretrained_model_name, - use_auth_token=use_auth_token, - revision=revision, - ) - - filenames = {sibling.rfilename for sibling in info.siblings} - model_filenames, variant_filenames = variant_compatible_siblings(filenames, variant=variant) - - # remove ignored filenames - model_filenames = set(model_filenames) - set(ignore_filenames) - variant_filenames = set(variant_filenames) - set(ignore_filenames) - - # if the whole pipeline is cached we don't have to ping the Hub - if revision in DEPRECATED_REVISION_ARGS and version.parse( - version.parse(__version__).base_version - ) >= version.parse("0.17.0"): - warn_deprecated_model_variant( - pretrained_model_name, use_auth_token, variant, revision, model_filenames - ) - - model_folder_names = {os.path.split(f)[0] for f in model_filenames} - - # all filenames compatible with variant will be added - allow_patterns = list(model_filenames) - - # allow all patterns from non-model folders - # this enables downloading schedulers, tokenizers, ... 
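The allow/ignore pattern lists assembled here are later compiled via `fnmatch.translate` and matched against the repo file listing; a small self-contained sketch of that filtering, with hypothetical filenames and patterns, looks like this:

```py
import fnmatch
import re

# hypothetical repo listing and patterns, mirroring the filtering applied further below
filenames = ["unet/model_state.pdparams", "unet/config.json", "vae/diffusion_model.safetensors", "model_index.json"]
allow_patterns = ["unet/*", "vae/config.json", "model_index.json"]
ignore_patterns = ["*.safetensors", "*.msgpack"]

re_allow = [re.compile(fnmatch.translate(p)) for p in allow_patterns]
re_ignore = [re.compile(fnmatch.translate(p)) for p in ignore_patterns]

expected_files = [
    f
    for f in filenames
    if any(p.match(f) for p in re_allow) and not any(p.match(f) for p in re_ignore)
]
print(expected_files)  # ['unet/model_state.pdparams', 'unet/config.json', 'model_index.json']
```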
- allow_patterns += [os.path.join(k, "*") for k in folder_names if k not in model_folder_names] - # also allow downloading config.json files with the model - allow_patterns += [os.path.join(k, "config.json") for k in model_folder_names] - - allow_patterns += [ - SCHEDULER_CONFIG_NAME, - CONFIG_NAME, - cls.config_name, - CUSTOM_PIPELINE_FILE_NAME, - ] - - # retrieve passed components that should not be downloaded - pipeline_class = _get_pipeline_class( - cls, config_dict, custom_pipeline=custom_pipeline, cache_dir=cache_dir, revision=custom_revision - ) - expected_components, _ = cls._get_signature_keys(pipeline_class) - passed_components = [k for k in expected_components if k in kwargs] - - if ( - use_safetensors - and not allow_pickle - and not is_safetensors_compatible( - model_filenames, variant=variant, passed_components=passed_components - ) - ): - raise EnvironmentError( - f"Could not found the necessary `safetensors` weights in {model_filenames} (variant={variant})" - ) - elif use_safetensors and is_safetensors_compatible( - model_filenames, variant=variant, passed_components=passed_components - ): - ignore_patterns = [ - "*.msgpack", - "*.bin", - "*.pdparams", - "*.pdiparams", - "*.pdmodel", - ] - - safetensors_variant_filenames = {f for f in variant_filenames if f.endswith(".safetensors")} - safetensors_model_filenames = {f for f in model_filenames if f.endswith(".safetensors")} - if ( - len(safetensors_variant_filenames) > 0 - and safetensors_model_filenames != safetensors_variant_filenames - ): - logger.warn( - f"\nA mixture of {variant} and non-{variant} filenames will be loaded.\nLoaded {variant} filenames:\n[{', '.join(safetensors_variant_filenames)}]\nLoaded non-{variant} filenames:\n[{', '.join(safetensors_model_filenames - safetensors_variant_filenames)}\nIf this behavior is not expected, please check your folder structure." - ) - else: - ignore_patterns = ["*.safetensors", "*.msgpack"] - if from_diffusers: - ignore_patterns.extend(["*.pdparams", "*.pdiparams", "*.pdmodel"]) - suffix = ".bin" - else: - if is_fastdeploy_model: - ignore_patterns.extend(["*.pdparams", "*.bin"]) - suffix = ".pdmodel" - else: - ignore_patterns.extend(["*.pdiparams", "*.pdmodel", "*.bin"]) - suffix = ".pdparams" - - bin_variant_filenames = {f for f in variant_filenames if f.endswith(suffix)} - bin_model_filenames = {f for f in model_filenames if f.endswith(suffix)} - if len(bin_variant_filenames) > 0 and bin_model_filenames != bin_variant_filenames: - logger.warn( - f"\nA mixture of {variant} and non-{variant} filenames will be loaded.\nLoaded {variant} filenames:\n[{', '.join(bin_variant_filenames)}]\nLoaded non-{variant} filenames:\n[{', '.join(bin_model_filenames - bin_variant_filenames)}\nIf this behavior is not expected, please check your folder structure." 
- ) - - # Don't download any objects that are passed - allow_patterns = [ - p for p in allow_patterns if not (len(p.split("/")) == 2 and p.split("/")[0] in passed_components) - ] - # Don't download index files of forbidden patterns either - ignore_patterns = ignore_patterns + [f"{i}.index.*json" for i in ignore_patterns] - - re_ignore_pattern = [re.compile(fnmatch.translate(p)) for p in ignore_patterns] - re_allow_pattern = [re.compile(fnmatch.translate(p)) for p in allow_patterns] - - expected_files = [f for f in filenames if not any(p.match(f) for p in re_ignore_pattern)] - expected_files = [f for f in expected_files if any(p.match(f) for p in re_allow_pattern)] - - snapshot_folder = Path(config_file).parent - pipeline_is_cached = all((snapshot_folder / f).is_file() for f in expected_files) - - if pipeline_is_cached: - # if the pipeline is cached, we can directly return it - # else call snapshot_download - return snapshot_folder - - user_agent = {"pipeline_class": cls.__name__} - if custom_pipeline is not None and not custom_pipeline.endswith(".py"): - user_agent["custom_pipeline"] = custom_pipeline - - # download all allow_patterns - ignore_patterns - cached_folder = snapshot_download( - pretrained_model_name, - cache_dir=cache_dir, - resume_download=resume_download, - force_download=force_download, # new added force_download - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - allow_patterns=list(set(allow_patterns) - set(ignore_filenames)), - ignore_patterns=list( - set(ignore_patterns + ignore_filenames) - ), # diffusers bug, so we must add this ignore_filenames! - user_agent=user_agent, - max_workers=max_workers, - ) - else: - # only support [PD] .pdparams, fastdeploy model - cached_folder = ppdiffusers_bos_dir_download( - pretrained_model_name, - revision=revision, - cache_dir=cache_dir, - resume_download=resume_download, - force_download=force_download, # new added force_download - folder_names=folder_names, - variant=variant, - is_fastdeploy_model=is_fastdeploy_model, - local_files_only=local_files_only, - max_workers=max_workers, - ) - - return cached_folder - - @classmethod - def from_pretrained_original_ckpt(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): - from .stable_diffusion.convert_from_ckpt_deprecated import ( - load_pipeline_from_original_stable_diffusion_ckpt, - ) - - resume_download = kwargs.pop("resume_download", False) - force_download = kwargs.pop("force_download", False) - paddle_dtype = kwargs.pop("paddle_dtype", None) - cache_dir = kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) - original_config_file = kwargs.pop("original_config_file", None) - requires_safety_checker = kwargs.pop("requires_safety_checker", False) - pretrained_model_name_or_path = str(pretrained_model_name_or_path) - if os.path.isfile(pretrained_model_name_or_path): - checkpoint_path = pretrained_model_name_or_path - elif pretrained_model_name_or_path.startswith("http://") or pretrained_model_name_or_path.startswith( - "https://" - ): - checkpoint_path = ppdiffusers_url_download( - pretrained_model_name_or_path, - cache_dir=cache_dir, - resume_download=resume_download, - force_download=force_download, - ) - else: - raise EnvironmentError(f"Please check your {pretrained_model_name_or_path}.") - pipeline = load_pipeline_from_original_stable_diffusion_ckpt( - checkpoint_path=checkpoint_path, - original_config_file=original_config_file, - paddle_dtype=paddle_dtype, - 
requires_safety_checker=requires_safety_checker, - cls=cls, - **kwargs, - ) - - return pipeline - - @staticmethod - def _get_signature_keys(obj): - parameters = inspect.signature(obj.__init__).parameters - required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty} - optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty}) - expected_modules = set(required_parameters.keys()) - {"self"} - return expected_modules, optional_parameters - - @property - def components(self) -> Dict[str, Any]: - r""" - - The `self.components` property can be useful to run different pipelines with the same weights and - configurations to not have to re-allocate memory. - - Examples: - - ```py - >>> from ppdiffusers import ( - ... StableDiffusionPipeline, - ... StableDiffusionImg2ImgPipeline, - ... StableDiffusionInpaintPipeline, - ... ) - - >>> text2img = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - >>> img2img = StableDiffusionImg2ImgPipeline(**text2img.components) - >>> inpaint = StableDiffusionInpaintPipeline(**text2img.components) - ``` - - Returns: - A dictionary containing all the modules needed to initialize the pipeline. - """ - expected_modules, optional_parameters = self._get_signature_keys(self) - components = { - k: getattr(self, k) for k in self.config.keys() if not k.startswith("_") and k not in optional_parameters - } - - if set(components.keys()) != expected_modules: - raise ValueError( - f"{self} has been incorrectly initialized or {self.__class__} is incorrectly implemented. Expected" - f" {expected_modules} to be defined, but {components.keys()} are defined." - ) - - return components - - @staticmethod - def numpy_to_pil(images): - """ - Convert a numpy image or a batch of images to a PIL image. - """ - return numpy_to_pil(images) - - def progress_bar(self, iterable=None, total=None): - if not hasattr(self, "_progress_bar_config"): - self._progress_bar_config = {} - elif not isinstance(self._progress_bar_config, dict): - raise ValueError( - f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}." - ) - - if iterable is not None: - return tqdm(iterable, **self._progress_bar_config) - elif total is not None: - return tqdm(total=total, **self._progress_bar_config) - else: - raise ValueError("Either `total` or `iterable` has to be defined.") - - def set_progress_bar_config(self, **kwargs): - self._progress_bar_config = kwargs - - def enable_xformers_memory_efficient_attention(self, attention_op: Optional[str] = None): - r""" - Enable memory efficient attention as implemented in xformers. - - When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference - time. Speed up at training time is not guaranteed. - - Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention - is used. - - Parameters: - attention_op (`Callable`, *optional*): - Override the default `None` operator for use as `op` argument to the - [`memory_efficient_attention()`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention) - function of xFormers. 
- - Examples: - - ```py - >>> import paddle - >>> from ppdiffusers import DiffusionPipeline - - >>> pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1", paddle_dtype=paddle.float16) - >>> pipe.enable_xformers_memory_efficient_attention("cutlass") - ``` - """ - self.set_use_memory_efficient_attention_xformers(True, attention_op) - - def disable_xformers_memory_efficient_attention(self): - r""" - Disable memory efficient attention as implemented in xformers. - """ - self.set_use_memory_efficient_attention_xformers(False) - - def set_use_memory_efficient_attention_xformers(self, valid: bool, attention_op: Optional[str] = None) -> None: - # Recursively walk through all the children. - # Any children which exposes the set_use_memory_efficient_attention_xformers method - # gets the message - def fn_recursive_set_mem_eff(module: nn.Layer): - if hasattr(module, "set_use_memory_efficient_attention_xformers"): - module.set_use_memory_efficient_attention_xformers(valid, attention_op) - - for child in module.children(): - fn_recursive_set_mem_eff(child) - - module_names, _ = self._get_signature_keys(self) - modules = [getattr(self, n, None) for n in module_names] - modules = [m for m in modules if isinstance(m, nn.Layer)] - - for module in modules: - fn_recursive_set_mem_eff(module) - - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - r""" - Enable sliced attention computation. - - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - - Args: - slice_size (`str` or `int`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is - provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` - must be a multiple of `slice_size`. - """ - self.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - r""" - Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go - back to computing attention in one step. - """ - # set slice_size = `None` to disable `attention slicing` - self.enable_attention_slicing(None) - - def set_attention_slice(self, slice_size: Optional[int]): - module_names, _ = self._get_signature_keys(self) - modules = [getattr(self, n, None) for n in module_names] - modules = [m for m in modules if isinstance(m, nn.Layer) and hasattr(m, "set_attention_slice")] - - for module in modules: - module.set_attention_slice(slice_size) - - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. - - When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several - steps. This is useful to save some memory and allow larger batch sizes. - """ - if hasattr(self, "vae"): - self.vae.enable_slicing() - if hasattr(self, "vqvae"): - self.vqvae.enable_slicing() - - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to - computing decoding in one step. - """ - if hasattr(self, "vae"): - self.vae.disable_slicing() - if hasattr(self, "vqvae"): - self.vqvae.disable_slicing() - - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. 
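Taken together, these memory-saving switches can be combined on a loaded pipeline; a hedged sketch, assuming `pipe` comes from `from_pretrained` as in the example above:

```py
>>> pipe.enable_attention_slicing()  # "auto": compute attention in two steps per head
>>> pipe.enable_vae_slicing()        # decode latents one slice at a time
>>> pipe.enable_vae_tiling()         # tile VAE decoding for very large images
```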
- When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in - several steps. This is useful to save a large amount of memory and to allow the processing of larger images. - """ - if hasattr(self, "vae"): - self.vae.enable_tiling() - if hasattr(self, "vqvae"): - self.vqvae.enable_tiling() - - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to - computing decoding in one step. - """ - if hasattr(self, "vae"): - self.vae.disable_tiling() - if hasattr(self, "vqvae"): - self.vqvae.disable_tiling() diff --git a/ppdiffusers/ppdiffusers/pipelines/pndm/__init__.py b/ppdiffusers/ppdiffusers/pipelines/pndm/__init__.py deleted file mode 100644 index c7e9fab7f29e..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/pndm/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa -from .pipeline_pndm import PNDMPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/pndm/pipeline_pndm.py b/ppdiffusers/ppdiffusers/pipelines/pndm/pipeline_pndm.py deleted file mode 100644 index a39aca956b97..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/pndm/pipeline_pndm.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import List, Optional, Tuple, Union - -import paddle - -from ...models import UNet2DModel -from ...schedulers import PNDMScheduler -from ...utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput - - -class PNDMPipeline(DiffusionPipeline): - r""" - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Parameters: - unet (`UNet2DModel`): U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - The `PNDMScheduler` to be used in combination with `unet` to denoise the encoded image. 
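A hedged end-to-end sketch of this pipeline; the checkpoint name is illustrative (any repo containing a compatible `UNet2DModel` and scheduler config should work):

```py
>>> from ppdiffusers import PNDMPipeline

>>> pipe = PNDMPipeline.from_pretrained("google/ddpm-cifar10-32")  # illustrative unconditional checkpoint
>>> image = pipe(batch_size=1, num_inference_steps=50).images[0]
>>> image.save("pndm_sample.png")
```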
- """ - - unet: UNet2DModel - scheduler: PNDMScheduler - - def __init__(self, unet: UNet2DModel, scheduler: PNDMScheduler): - super().__init__() - - scheduler = PNDMScheduler.from_config(scheduler.config) - - self.register_modules(unet=unet, scheduler=scheduler) - - @paddle.no_grad() - def __call__( - self, - batch_size: int = 1, - num_inference_steps: int = 50, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - **kwargs, - ) -> Union[ImagePipelineOutput, Tuple]: - r""" - Args: - batch_size (`int`, `optional`, defaults to 1): The number of images to generate. - num_inference_steps (`int`, `optional`, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - generator (`paddle.Generator`, `optional`): A [paddle - generator](to make generation deterministic. - output_type (`str`, `optional`, defaults to `"pil"`): The output format of the generate image. Choose - between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, `optional`, defaults to `True`): Whether or not to return a - [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - - Returns: - [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is - True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. - """ - # For more information on the sampling method you can take a look at Algorithm 2 of - # the official paper: https://arxiv.org/pdf/2202.09778.pdf - - # Sample gaussian noise to begin loop - image = randn_tensor( - (batch_size, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size), - generator=generator, - ) - - self.scheduler.set_timesteps(num_inference_steps) - for t in self.progress_bar(self.scheduler.timesteps): - model_output = self.unet(image, t).sample - - image = self.scheduler.step(model_output, t, image).prev_sample - - image = (image / 2 + 0.5).clip(0, 1) - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,) - - return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/repaint/__init__.py b/ppdiffusers/ppdiffusers/pipelines/repaint/__init__.py deleted file mode 100644 index 18784e3ee227..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/repaint/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# flake8: noqa -from .pipeline_repaint import RePaintPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/repaint/pipeline_repaint.py b/ppdiffusers/ppdiffusers/pipelines/repaint/pipeline_repaint.py deleted file mode 100644 index 10c982149f4b..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/repaint/pipeline_repaint.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 ETH Zurich Computer Vision Lab and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle -import PIL - -from ...models import UNet2DModel -from ...schedulers import RePaintScheduler -from ...utils import PIL_INTERPOLATION, logging, randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess -def _preprocess_image(image: Union[List, PIL.Image.Image, paddle.Tensor]): - if isinstance(image, paddle.Tensor): - return image - elif isinstance(image, PIL.Image.Image): - image = [image] - - if isinstance(image[0], PIL.Image.Image): - w, h = image[0].size - w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - - image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] - image = np.concatenate(image, axis=0) - image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = 2.0 * image - 1.0 - image = paddle.to_tensor(image) - elif isinstance(image[0], paddle.Tensor): - image = paddle.concat(image, axis=0) - return image - - -def _preprocess_mask(mask: Union[List, PIL.Image.Image, paddle.Tensor]): - if isinstance(mask, paddle.Tensor): - return mask - elif isinstance(mask, PIL.Image.Image): - mask = [mask] - - if isinstance(mask[0], PIL.Image.Image): - w, h = mask[0].size - w, h = (x - x % 32 for x in (w, h)) # resize to integer multiple of 32 - mask = [np.array(m.convert("L").resize((w, h), resample=PIL_INTERPOLATION["nearest"]))[None, :] for m in mask] - mask = np.concatenate(mask, axis=0) - mask = mask.astype(np.float32) / 255.0 - mask[mask < 0.5] = 0 - mask[mask >= 0.5] = 1 - mask = paddle.to_tensor(mask) - elif isinstance(mask[0], paddle.Tensor): - mask = paddle.concat(mask, axis=0) - return mask - - -class RePaintPipeline(DiffusionPipeline): - unet: UNet2DModel - scheduler: RePaintScheduler - - def __init__(self, unet, scheduler): - super().__init__() - self.register_modules(unet=unet, scheduler=scheduler) - - @paddle.no_grad() - def __call__( - self, - image: Union[paddle.Tensor, PIL.Image.Image], - mask_image: Union[paddle.Tensor, PIL.Image.Image], - num_inference_steps: int = 250, - eta: float = 0.0, - jump_length: int = 10, - jump_n_sample: int = 10, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - output_type: 
Optional[str] = "pil", - return_dict: bool = True, - ) -> Union[ImagePipelineOutput, Tuple]: - r""" - Args: - image (`paddle.Tensor` or `PIL.Image.Image`): - The original image to inpaint on. - mask_image (`paddle.Tensor` or `PIL.Image.Image`): - The mask_image where 0.0 values define which part of the original image to inpaint (change). - num_inference_steps (`int`, *optional*, defaults to 1000): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - eta (`float`): - The weight of noise for added noise in a diffusion step. Its value is between 0.0 and 1.0 - 0.0 is DDIM - and 1.0 is DDPM scheduler respectively. - jump_length (`int`, *optional*, defaults to 10): - The number of steps taken forward in time before going backward in time for a single jump ("j" in - RePaint paper). Take a look at Figure 9 and 10 in https://arxiv.org/pdf/2201.09865.pdf. - jump_n_sample (`int`, *optional*, defaults to 10): - The number of times we will make forward time jump for a given chosen time sample. Take a look at - Figure 9 and 10 in https://arxiv.org/pdf/2201.09865.pdf. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - - Returns: - [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is - True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. - """ - original_image = image - - original_image = _preprocess_image(original_image) - original_image = original_image.cast(self.unet.dtype) - mask_image = _preprocess_mask(mask_image) - mask_image = mask_image.cast(self.unet.dtype) - - batch_size = original_image.shape[0] - - # sample gaussian noise to begin the loop - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - image_shape = original_image.shape - image = randn_tensor(image_shape, generator=generator, dtype=self.unet.dtype) - - # set step values - self.scheduler.set_timesteps(num_inference_steps, jump_length, jump_n_sample) - self.scheduler.eta = eta - - t_last = self.scheduler.timesteps[0] + 1 - generator = generator[0] if isinstance(generator, list) else generator - for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)): - if t < t_last: - # predict the noise residual - model_output = self.unet(image, t).sample - # compute previous image: x_t -> x_t-1 - image = self.scheduler.step(model_output, t, image, original_image, mask_image, generator).prev_sample - - else: - # compute the reverse: x_t-1 -> x_t - image = self.scheduler.undo_step(image, t_last, generator) - t_last = t - - image = (image / 2 + 0.5).clip(0, 1) - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,) - - return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/score_sde_ve/__init__.py b/ppdiffusers/ppdiffusers/pipelines/score_sde_ve/__init__.py deleted file mode 100644 index 14f547799039..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/score_sde_ve/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa -from .pipeline_score_sde_ve import ScoreSdeVePipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py b/ppdiffusers/ppdiffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py deleted file mode 100644 index 4e81855ba00f..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List, Optional, Tuple, Union - -import paddle - -from ...models import UNet2DModel -from ...schedulers import ScoreSdeVeScheduler -from ...utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput - - -class ScoreSdeVePipeline(DiffusionPipeline): - r""" - Parameters: - This model inherits from [`DiffusionPipeline`]. 
Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image. scheduler ([`SchedulerMixin`]): - The [`ScoreSdeVeScheduler`] scheduler to be used in combination with `unet` to denoise the encoded image. - """ - unet: UNet2DModel - scheduler: ScoreSdeVeScheduler - - def __init__(self, unet: UNet2DModel, scheduler: DiffusionPipeline): - super().__init__() - self.register_modules(unet=unet, scheduler=scheduler) - - @paddle.no_grad() - def __call__( - self, - batch_size: int = 1, - num_inference_steps: int = 2000, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - **kwargs, - ) -> Union[ImagePipelineOutput, Tuple]: - r""" - Args: - batch_size (`int`, *optional*, defaults to 1): - The number of images to generate. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - - Returns: - [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is - True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. - """ - - img_size = self.unet.config.sample_size - shape = (batch_size, 3, img_size, img_size) - - model = self.unet - - sample = randn_tensor(shape, generator=generator) * self.scheduler.init_noise_sigma - - self.scheduler.set_timesteps(num_inference_steps) - self.scheduler.set_sigmas(num_inference_steps) - - for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)): - sigma_t = self.scheduler.sigmas[i] * paddle.ones((shape[0],)) - - # correction step - for _ in range(self.scheduler.config.correct_steps): - model_output = self.unet(sample, sigma_t).sample - sample = self.scheduler.step_correct(model_output, sample, generator=generator).prev_sample - - # prediction step - model_output = model(sample, sigma_t).sample - output = self.scheduler.step_pred(model_output, t, sample, generator=generator) - - sample, sample_mean = output.prev_sample, output.prev_sample_mean - - sample = sample_mean.clip(0, 1) - sample = sample.transpose([0, 2, 3, 1]).numpy() - if output_type == "pil": - sample = self.numpy_to_pil(sample) - - if not return_dict: - return (sample,) - - return ImagePipelineOutput(images=sample) diff --git a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/__init__.py deleted file mode 100644 index 9842e59ad078..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/__init__.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa -from dataclasses import dataclass -from typing import List, Optional, Union - -import numpy as np -import PIL.Image - -from ...utils import BaseOutput, is_paddle_available, is_paddlenlp_available - - -@dataclass -class SemanticStableDiffusionPipelineOutput(BaseOutput): - """ - Output class for Stable Diffusion pipelines. - - Args: - images (`List[PIL.Image.Image]` or `np.ndarray`) - List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, - num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. - nsfw_content_detected (`List[bool]`) - List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, or `None` if safety checking could not be performed. - """ - - images: Union[List[PIL.Image.Image], np.ndarray] - nsfw_content_detected: Optional[List[bool]] - - -if is_paddle_available() and is_paddlenlp_available(): - from .pipeline_semantic_stable_diffusion import SemanticStableDiffusionPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/custom_quantile.py b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/custom_quantile.py deleted file mode 100644 index 7fd2b4f40775..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/custom_quantile.py +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -try: - from paddle.framework import in_dygraph_mode -except ImportError: - from paddle.fluid.framework import in_dygraph_mode - - -def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False): - """ - Compute the quantile of the input along the specified axis. - - Args: - x (Tensor): The input Tensor, it's data type can be float32, float64, int32, int64. - q (int|float|list): The q for calculate quantile, which should be in range [0, 1]. If q is a list, - each q will be calculated and the first dimension of output is same to the number of ``q`` . - axis (int|list, optional): The axis along which to calculate quantile. ``axis`` should be int or list of int. - ``axis`` should be in range [-D, D), where D is the dimensions of ``x`` . - If ``axis`` is less than 0, it works the same way as :math:`axis + D`. - If ``axis`` is a list, quantile is calculated over all elements of given axises. - If ``axis`` is None, quantile is calculated over all elements of ``x``. Default is None. 
- keepdim (bool, optional): Whether to reserve the reduced dimension(s) - in the output Tensor. If ``keepdim`` is True, the dimensions of - the output Tensor is the same as ``x`` except in the reduced - dimensions(it is of size 1 in this case). Otherwise, the shape of - the output Tensor is squeezed in ``axis`` . Default is False. - ignore_nan: (bool, optional): Whether to ignore NaN of input Tensor. - If ``ignore_nan`` is True, it will calculate nanquantile. - Otherwise it will calculate quantile. Default is False. - - Returns: - Tensor, results of quantile along ``axis`` of ``x``. - In order to obtain higher precision, data type of results will be float64. - """ - - # Validate q - if isinstance(q, (int, float)): - q = [q] - elif isinstance(q, (list, tuple)): - if len(q) <= 0: - raise ValueError("q should not be empty") - else: - raise TypeError("Type of q should be int, float, list or tuple.") - - # Validate axis - dims = len(x.shape) - out_shape = list(x.shape) - if axis is None: - x = paddle.flatten(x) - axis = 0 - out_shape = [1] * dims - else: - if isinstance(axis, list): - axis_src, axis_dst = [], [] - for axis_single in axis: - if not isinstance(axis_single, int) or not (axis_single < dims and axis_single >= -dims): - raise ValueError( - "Axis should be None, int, or a list, element should in range [-rank(x), rank(x))." - ) - if axis_single < 0: - axis_single = axis_single + dims - axis_src.append(axis_single) - out_shape[axis_single] = 1 - - axis_dst = list(range(-len(axis), 0)) - x = paddle.moveaxis(x, axis_src, axis_dst) - if len(axis_dst) == 0: - x = paddle.flatten(x) - axis = 0 - else: - x = paddle.flatten(x, axis_dst[0], axis_dst[-1]) - axis = axis_dst[0] - else: - if not isinstance(axis, int) or not (axis < dims and axis >= -dims): - raise ValueError("Axis should be None, int, or a list, element should in range [-rank(x), rank(x)).") - if axis < 0: - axis += dims - out_shape[axis] = 1 - - mask = x.isnan() - valid_counts = mask.logical_not().sum(axis=axis, keepdim=True, dtype="float64") - - indices = [] - - for q_num in q: - if q_num < 0 or q_num > 1: - raise ValueError("q should be in range [0, 1]") - if in_dygraph_mode(): - q_num = paddle.to_tensor(q_num, dtype="float64") - if ignore_nan: - indices.append(q_num * (valid_counts - 1)) - else: - # TODO: Use paddle.index_fill instead of where - index = q_num * (valid_counts - 1) - last_index = x.shape[axis] - 1 - nums = paddle.full_like(index, fill_value=last_index) - index = paddle.where(mask.any(axis=axis, keepdim=True), nums, index) - indices.append(index) - - # sorted_tensor = paddle.sort(x, axis) - data, _ = paddle.topk(x, k=x.shape[axis], axis=axis, largest=False) - sorted_tensor = data - - outputs = [] - - # TODO(chenjianye): replace the for-loop to directly take elements. 
- for index in indices: - indices_below = paddle.floor(index).astype(paddle.int32) - indices_upper = paddle.ceil(index).astype(paddle.int32) - tensor_upper = paddle.take_along_axis(sorted_tensor, indices_upper, axis=axis) - tensor_below = paddle.take_along_axis(sorted_tensor, indices_below, axis=axis) - weights = index - indices_below.astype("float64") - out = paddle.lerp( - tensor_below.astype("float64"), - tensor_upper.astype("float64"), - weights, - ) - if not keepdim: - out = paddle.squeeze(out, axis=axis) - else: - out = out.reshape(out_shape) - outputs.append(out) - - if len(q) > 1: - outputs = paddle.stack(outputs, 0) - else: - outputs = outputs[0] - - return outputs - - -def quantile(x, q, axis=None, keepdim=False): - """ - Compute the quantile of the input along the specified axis. - If any values in a reduced row are NaN, then the quantiles for that reduction will be NaN. - - Args: - x (Tensor): The input Tensor, it's data type can be float32, float64, int32, int64. - q (int|float|list): The q for calculate quantile, which should be in range [0, 1]. If q is a list, - each q will be calculated and the first dimension of output is same to the number of ``q`` . - axis (int|list, optional): The axis along which to calculate quantile. ``axis`` should be int or list of int. - ``axis`` should be in range [-D, D), where D is the dimensions of ``x`` . - If ``axis`` is less than 0, it works the same way as :math:`axis + D`. - If ``axis`` is a list, quantile is calculated over all elements of given axises. - If ``axis`` is None, quantile is calculated over all elements of ``x``. Default is None. - keepdim (bool, optional): Whether to reserve the reduced dimension(s) - in the output Tensor. If ``keepdim`` is True, the dimensions of - the output Tensor is the same as ``x`` except in the reduced - dimensions(it is of size 1 in this case). Otherwise, the shape of - the output Tensor is squeezed in ``axis`` . Default is False. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor, results of quantile along ``axis`` of ``x``. - In order to obtain higher precision, data type of results will be float64. - - Examples: - .. code-block:: python - - import paddle - - y = paddle.arange(0, 8 ,dtype="float32").reshape([4, 2]) - # Tensor(shape=[4, 2], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[0., 1.], - # [2., 3.], - # [4., 5.], - # [6., 7.]]) - - y1 = paddle.quantile(y, q=0.5, axis=[0, 1]) - # Tensor(shape=[], dtype=float64, place=Place(cpu), stop_gradient=True, - # 3.50000000) - - y2 = paddle.quantile(y, q=0.5, axis=1) - # Tensor(shape=[4], dtype=float64, place=Place(cpu), stop_gradient=True, - # [0.50000000, 2.50000000, 4.50000000, 6.50000000]) - - y3 = paddle.quantile(y, q=[0.3, 0.5], axis=0) - # Tensor(shape=[2, 2], dtype=float64, place=Place(cpu), stop_gradient=True, - # [[1.80000000, 2.80000000], - # [3. , 4. 
]]) - - y[0,0] = float("nan") - y4 = paddle.quantile(y, q=0.8, axis=1, keepdim=True) - # Tensor(shape=[4, 1], dtype=float64, place=Place(cpu), stop_gradient=True, - # [[nan ], - # [2.80000000], - # [4.80000000], - # [6.80000000]]) - - """ - return _compute_quantile(x, q, axis=axis, keepdim=keepdim, ignore_nan=False) diff --git a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py deleted file mode 100644 index 366337382838..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +++ /dev/null @@ -1,713 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from itertools import repeat -from typing import Callable, List, Optional, Union - -import paddle - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer - -from ...models import AutoencoderKL, UNet2DConditionModel -from ...pipeline_utils import DiffusionPipeline -from ...pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import logging, randn_tensor -from . import SemanticStableDiffusionPipelineOutput -from .custom_quantile import quantile - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import paddle - >>> from ppdiffusers import SemanticStableDiffusionPipeline - - >>> pipe = SemanticStableDiffusionPipeline.from_pretrained( - ... "runwayml/stable-diffusion-v1-5", paddle_dtype=paddle.float16 - ... ) - - >>> out = pipe( - ... prompt="a photo of the face of a woman", - ... num_images_per_prompt=1, - ... guidance_scale=7, - ... editing_prompt=[ - ... "smiling, smile", # Concepts to apply - ... "glasses, wearing glasses", - ... "curls, wavy hair, curly hair", - ... "beard, full beard, mustache", - ... ], - ... reverse_editing_direction=[ - ... False, - ... False, - ... False, - ... False, - ... ], # Direction of guidance i.e. increase all concepts - ... edit_warmup_steps=[10, 10, 10, 10], # Warmup period for each concept - ... edit_guidance_scale=[4, 5, 5, 5.4], # Guidance scale for each concept - ... edit_threshold=[ - ... 0.99, - ... 0.975, - ... 0.925, - ... 0.96, - ... ], # Threshold for each concept. Threshold equals the percentile of the latent space that will be discarded. I.e. threshold=0.99 uses 1% of the latent dimensions - ... edit_momentum_scale=0.3, # Momentum scale that will be added to the latent guidance - ... edit_mom_beta=0.6, # Momentum beta - ... edit_weights=[1, 1, 1, 1, 1], # Weights of the individual concepts against each other - ... 
) - >>> image = out.images[0] - ``` -""" - - -class SemanticStableDiffusionPipeline(DiffusionPipeline): - r""" - Pipeline for text-to-image generation with latent editing. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - This model builds on the implementation of ['StableDiffusionPipeline'] - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`Q16SafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = True, - ): - super().__init__() - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
- ) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
- ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @paddle.no_grad() - def __call__( - self, - prompt: Union[str, List[str]], - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - editing_prompt: Optional[Union[str, List[str]]] = None, - editing_prompt_embeddings: Optional[paddle.Tensor] = None, - reverse_editing_direction: Optional[Union[bool, List[bool]]] = False, - edit_guidance_scale: Optional[Union[float, List[float]]] = 5, - edit_warmup_steps: Optional[Union[int, List[int]]] = 10, - edit_cooldown_steps: Optional[Union[int, List[int]]] = None, - edit_threshold: Optional[Union[float, List[float]]] = 0.9, - edit_momentum_scale: Optional[float] = 0.1, - edit_mom_beta: Optional[float] = 0.4, - edit_weights: Optional[List[float]] = None, - sem_guidance: Optional[List[paddle.Tensor]] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - editing_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to use for Semantic guidance. Semantic guidance is disabled by setting - `editing_prompt = None`. Guidance direction of prompt should be specified via - `reverse_editing_direction`. - editing_prompt_embeddings (`paddle.Tensor>`, *optional*): - Pre-computed embeddings to use for semantic guidance. Guidance direction of embedding should be - specified via `reverse_editing_direction`. - reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`): - Whether the corresponding prompt in `editing_prompt` should be increased or decreased. - edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5): - Guidance scale for semantic guidance. If provided as list values should correspond to `editing_prompt`. - `edit_guidance_scale` is defined as `s_e` of equation 6 of [SEGA - Paper](https://arxiv.org/pdf/2301.12247.pdf). - edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10): - Number of diffusion steps (for each prompt) for which semantic guidance will not be applied. Momentum - will still be calculated for those steps and applied once all warmup periods are over. - `edit_warmup_steps` is defined as `delta` (δ) of [SEGA Paper](https://arxiv.org/pdf/2301.12247.pdf). - edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`): - Number of diffusion steps (for each prompt) after which semantic guidance will no longer be applied. - edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9): - Threshold of semantic guidance. 
- edit_momentum_scale (`float`, *optional*, defaults to 0.1): - Scale of the momentum to be added to the semantic guidance at each diffusion step. If set to 0.0 - momentum will be disabled. Momentum is already built up during warmup, i.e. for diffusion steps smaller - than `sld_warmup_steps`. Momentum will only be added to latent guidance once all warmup periods are - finished. `edit_momentum_scale` is defined as `s_m` of equation 7 of [SEGA - Paper](https://arxiv.org/pdf/2301.12247.pdf). - edit_mom_beta (`float`, *optional*, defaults to 0.4): - Defines how semantic guidance momentum builds up. `edit_mom_beta` indicates how much of the previous - momentum will be kept. Momentum is already built up during warmup, i.e. for diffusion steps smaller - than `edit_warmup_steps`. `edit_mom_beta` is defined as `beta_m` (β) of equation 8 of [SEGA - Paper](https://arxiv.org/pdf/2301.12247.pdf). - edit_weights (`List[float]`, *optional*, defaults to `None`): - Indicates how much each individual concept should influence the overall guidance. If no weights are - provided all concepts are applied equally. `edit_mom_beta` is defined as `g_i` of equation 9 of [SEGA - Paper](https://arxiv.org/pdf/2301.12247.pdf). - sem_guidance (`List[paddle.Tensor]`, *optional*): - List of pre-generated guidance vectors to be applied at generation. Length of the list has to - correspond to `num_inference_steps`. - - Returns: - [`~pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput`] if `return_dict` is True, - otherwise a `tuple. When returning a tuple, the first element is a list with the generated images, and the - second element is a list of `bool`s denoting whether the corresponding generated image likely represents - "not-safe-for-work" (nsfw) content, according to the `safety_checker`. - """ - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs(prompt, height, width, callback_steps) - - # 2. 
Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - - if editing_prompt: - enable_edit_guidance = True - if isinstance(editing_prompt, str): - editing_prompt = [editing_prompt] - enabled_editing_prompts = len(editing_prompt) - elif editing_prompt_embeddings is not None: - enable_edit_guidance = True - enabled_editing_prompts = editing_prompt_embeddings.shape[0] - else: - enabled_editing_prompts = 0 - enable_edit_guidance = False - - # get prompt text embeddings - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - - if text_input_ids.shape[-1] > self.tokenizer.model_max_length: - removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] - text_embeddings = self.text_encoder(text_input_ids)[0] - - # duplicate text embeddings for each generation per prompt, using mps friendly method - bs_embed, seq_len, _ = text_embeddings.shape - text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1]) - text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - if enable_edit_guidance: - # get safety text embeddings - if editing_prompt_embeddings is None: - edit_concepts_input = self.tokenizer( - [x for item in editing_prompt for x in repeat(item, batch_size)], - padding="max_length", - max_length=self.tokenizer.model_max_length, - return_tensors="pd", - ) - - edit_concepts_input_ids = edit_concepts_input.input_ids - - if edit_concepts_input_ids.shape[-1] > self.tokenizer.model_max_length: - removed_text = self.tokenizer.batch_decode( - edit_concepts_input_ids[:, self.tokenizer.model_max_length :] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - edit_concepts_input_ids = edit_concepts_input_ids[:, : self.tokenizer.model_max_length] - edit_concepts = self.text_encoder(edit_concepts_input_ids)[0] - else: - edit_concepts = editing_prompt_embeddings.tile([batch_size, 1, 1]) - - # duplicate text embeddings for each generation per prompt, using mps friendly method - bs_embed_edit, seq_len_edit, _ = edit_concepts.shape - edit_concepts = edit_concepts.tile([1, num_images_per_prompt, 1]) - edit_concepts = edit_concepts.reshape([bs_embed_edit * num_images_per_prompt, seq_len_edit, -1]) - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - # get unconditional embeddings for classifier free guidance - - if do_classifier_free_guidance: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." 
- ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - max_length = text_input_ids.shape[-1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = uncond_embeddings.shape[1] - uncond_embeddings = uncond_embeddings.tile([batch_size, num_images_per_prompt, 1]) - uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - if enable_edit_guidance: - text_embeddings = paddle.concat([uncond_embeddings, text_embeddings, edit_concepts]) - else: - text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) - # get the initial random noise unless the user supplied it - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - text_embeddings.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. 
- extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # Initialize edit_momentum to None - edit_momentum = None - - self.uncond_estimates = None - self.text_estimates = None - self.edit_estimates = None - self.sem_guidance = None - - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = ( - paddle.concat([latents] * (2 + enabled_editing_prompts)) if do_classifier_free_guidance else latents - ) - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_out = noise_pred.chunk(2 + enabled_editing_prompts) # [b,4, 64, 64] - noise_pred_uncond, noise_pred_text = noise_pred_out[0], noise_pred_out[1] - noise_pred_edit_concepts = noise_pred_out[2:] - - # default text guidance - noise_guidance = guidance_scale * (noise_pred_text - noise_pred_uncond) - # noise_guidance = (noise_pred_text - noise_pred_edit_concepts[0]) - - if self.uncond_estimates is None: - self.uncond_estimates = paddle.zeros( - (num_inference_steps + 1, *noise_pred_uncond.shape), dtype=noise_pred.dtype - ) - self.uncond_estimates[i] = noise_pred_uncond.detach() - - if self.text_estimates is None: - self.text_estimates = paddle.zeros( - (num_inference_steps + 1, *noise_pred_text.shape), dtype=noise_pred.dtype - ) - self.text_estimates[i] = noise_pred_text.detach() - - if self.edit_estimates is None and enable_edit_guidance: - self.edit_estimates = paddle.zeros( - (num_inference_steps + 1, len(noise_pred_edit_concepts), *noise_pred_edit_concepts[0].shape), - dtype=noise_pred.dtype, - ) - - if self.sem_guidance is None: - self.sem_guidance = paddle.zeros( - (num_inference_steps + 1, *noise_pred_text.shape), dtype=noise_pred.dtype - ) - - if edit_momentum is None: - edit_momentum = paddle.zeros_like(noise_guidance) - - if enable_edit_guidance: - concept_weights = paddle.zeros( - (len(noise_pred_edit_concepts), noise_guidance.shape[0]), - dtype=noise_guidance.dtype, - ) - noise_guidance_edit = paddle.zeros( - (len(noise_pred_edit_concepts), *noise_guidance.shape), - dtype=noise_guidance.dtype, - ) - # noise_guidance_edit = torch.zeros_like(noise_guidance) - warmup_inds = [] - for c, noise_pred_edit_concept in enumerate(noise_pred_edit_concepts): - self.edit_estimates[i, c] = noise_pred_edit_concept - if isinstance(edit_guidance_scale, list): - edit_guidance_scale_c = edit_guidance_scale[c] - else: - edit_guidance_scale_c = edit_guidance_scale - - if isinstance(edit_threshold, list): - edit_threshold_c = edit_threshold[c] - else: - edit_threshold_c = edit_threshold - if isinstance(reverse_editing_direction, list): - reverse_editing_direction_c = reverse_editing_direction[c] - else: - reverse_editing_direction_c = reverse_editing_direction - if edit_weights: - edit_weight_c = edit_weights[c] - else: - edit_weight_c = 1.0 - if isinstance(edit_warmup_steps, list): - edit_warmup_steps_c = edit_warmup_steps[c] - else: - edit_warmup_steps_c = edit_warmup_steps - - if isinstance(edit_cooldown_steps, list): - edit_cooldown_steps_c = edit_cooldown_steps[c] - elif edit_cooldown_steps is None: - edit_cooldown_steps_c = i + 1 - else: - edit_cooldown_steps_c = edit_cooldown_steps - if i >= edit_warmup_steps_c: - warmup_inds.append(c) - if i >= edit_cooldown_steps_c: - noise_guidance_edit[c, :, :, :, :] = 
paddle.zeros_like(noise_pred_edit_concept) - continue - - noise_guidance_edit_tmp = noise_pred_edit_concept - noise_pred_uncond - # tmp_weights = (noise_pred_text - noise_pred_edit_concept).sum(dim=(1, 2, 3)) - tmp_weights = (noise_guidance - noise_pred_edit_concept).sum((1, 2, 3)) - - tmp_weights = paddle.full_like(tmp_weights, edit_weight_c) # * (1 / enabled_editing_prompts) - if reverse_editing_direction_c: - noise_guidance_edit_tmp = noise_guidance_edit_tmp * -1 - concept_weights[c, :] = tmp_weights - - noise_guidance_edit_tmp = noise_guidance_edit_tmp * edit_guidance_scale_c - - # quantile function expects float32 - if noise_guidance_edit_tmp.dtype == paddle.float32: - tmp = quantile( - paddle.abs(noise_guidance_edit_tmp).flatten(2), - edit_threshold_c, - axis=2, - keepdim=False, - ) - else: - tmp = quantile( - paddle.abs(noise_guidance_edit_tmp).flatten(2).cast(paddle.float32), - edit_threshold_c, - axis=2, - keepdim=False, - ).cast(noise_guidance_edit_tmp.dtype) - - noise_guidance_edit_tmp = paddle.where( - paddle.abs(noise_guidance_edit_tmp) >= tmp[:, :, None, None], - noise_guidance_edit_tmp, - paddle.zeros_like(noise_guidance_edit_tmp), - ) - noise_guidance_edit[c, :, :, :, :] = noise_guidance_edit_tmp - - # noise_guidance_edit = noise_guidance_edit + noise_guidance_edit_tmp - - warmup_inds = paddle.to_tensor(warmup_inds) - if len(noise_pred_edit_concepts) > warmup_inds.shape[0] > 0: - # concept_weights = concept_weights.to("cpu") # Offload to cpu - # noise_guidance_edit = noise_guidance_edit.to("cpu") - - concept_weights_tmp = paddle.index_select(concept_weights, warmup_inds, 0) - concept_weights_tmp = paddle.where( - concept_weights_tmp < 0, paddle.zeros_like(concept_weights_tmp), concept_weights_tmp - ) - concept_weights_tmp = concept_weights_tmp / concept_weights_tmp.sum(0) - # concept_weights_tmp = torch.nan_to_num(concept_weights_tmp) - - noise_guidance_edit_tmp = paddle.index_select(noise_guidance_edit, warmup_inds, 0) - noise_guidance_edit_tmp = paddle.einsum( - "cb,cbijk->bijk", concept_weights_tmp, noise_guidance_edit_tmp - ) - noise_guidance_edit_tmp = noise_guidance_edit_tmp - noise_guidance = noise_guidance + noise_guidance_edit_tmp - - self.sem_guidance[i] = noise_guidance_edit_tmp.detach() - - del noise_guidance_edit_tmp - del concept_weights_tmp - concept_weights = concept_weights - noise_guidance_edit = noise_guidance_edit - - concept_weights = paddle.where( - concept_weights < 0, paddle.zeros_like(concept_weights), concept_weights - ) - # concept_weights = paddle.nan_to_num(concept_weights) - - noise_guidance_edit = paddle.einsum("cb,cbijk->bijk", concept_weights, noise_guidance_edit) - - noise_guidance_edit = noise_guidance_edit + edit_momentum_scale * edit_momentum - - edit_momentum = edit_mom_beta * edit_momentum + (1 - edit_mom_beta) * noise_guidance_edit - - if warmup_inds.shape[0] == len(noise_pred_edit_concepts): - noise_guidance = noise_guidance + noise_guidance_edit - self.sem_guidance[i] = noise_guidance_edit.detach() - - if sem_guidance is not None: - edit_guidance = sem_guidance[i] - noise_guidance = noise_guidance + edit_guidance - - noise_pred = noise_pred_uncond + noise_guidance - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 8. 
Post-processing - image = self.decode_latents(latents) - - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(text_embeddings.dtype) - ) - else: - has_nsfw_concept = None - - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image, has_nsfw_concept) - - return SemanticStableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/__init__.py deleted file mode 100644 index 53dd30da9855..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/__init__.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ...utils import ( - OptionalDependencyNotAvailable, - is_note_seq_available, - is_paddle_available, - is_paddlenlp_available, -) - -try: - if not (is_paddlenlp_available() and is_paddle_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ...utils.dummy_paddle_and_paddlenlp_objects import * -else: - from .notes_encoder import SpectrogramNotesEncoder - from .pipeline_spectrogram_diffusion import ( - SpectrogramContEncoder, - SpectrogramDiffusionPipeline, - T5FilmDecoder, - ) -try: - if not (is_paddlenlp_available() and is_paddle_available() and is_note_seq_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ...utils.dummy_paddle_and_paddlenlp_and_note_seq_objects import * -else: - from .midi_utils import MidiProcessor diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/continous_encoder.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/continous_encoder.py deleted file mode 100644 index 8bedfba72157..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/continous_encoder.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle -import paddle.nn as nn - -from paddlenlp.transformers.t5.configuration import T5Config -from paddlenlp.transformers.t5.modeling import T5Block, T5LayerNorm - -from ...configuration_utils import ConfigMixin, ModuleUtilsMixin, register_to_config -from ...models import ModelMixin - - -class SpectrogramContEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): - @register_to_config - def __init__( - self, - input_dims: int, - targets_context_length: int, - d_model: int, - dropout_rate: float, - num_layers: int, - num_heads: int, - d_kv: int, - d_ff: int, - feed_forward_proj: str, - is_decoder: bool = False, - ): - super().__init__() - self.input_proj = nn.Linear(in_features=input_dims, out_features=d_model, bias_attr=False) - self.position_encoding = nn.Embedding(targets_context_length, d_model) - self.position_encoding.weight.stop_gradient = True - self.dropout_pre = nn.Dropout(p=dropout_rate) - t5config = T5Config( - d_model=d_model, - num_heads=num_heads, - d_kv=d_kv, - d_ff=d_ff, - feed_forward_proj=feed_forward_proj, - dropout_rate=dropout_rate, - is_decoder=is_decoder, - is_encoder_decoder=False, - ) - self.encoders = nn.LayerList() - for lyr_num in range(num_layers): - lyr = T5Block(t5config) - self.encoders.append(lyr) - self.layer_norm = T5LayerNorm(d_model) - self.dropout_post = nn.Dropout(p=dropout_rate) - - def forward(self, encoder_inputs, encoder_inputs_mask): - - # terminal relative positional encodings - x = self.input_proj(encoder_inputs) - max_positions = encoder_inputs.shape[1] - input_positions = paddle.arange(end=max_positions) - - seq_lens = encoder_inputs_mask.sum(axis=-1) - input_positions = paddle.roll(x=input_positions.unsqueeze(axis=0), shifts=tuple(seq_lens.tolist()), axis=0) - x += self.position_encoding(input_positions) - x = self.dropout_pre(x) - - # inverted the attention mask - input_shape = encoder_inputs.shape - extended_attention_mask = self.get_extended_attention_mask(encoder_inputs_mask, input_shape) - - for lyr in self.encoders: - x = lyr(x, extended_attention_mask)[0] - x = self.layer_norm(x) - return self.dropout_post(x), encoder_inputs_mask diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/midi_utils.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/midi_utils.py deleted file mode 100644 index 2113c8ffa678..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/midi_utils.py +++ /dev/null @@ -1,637 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import dataclasses -import math -import os -from typing import ( - Any, - Callable, - List, - Mapping, - MutableMapping, - Optional, - Sequence, - Tuple, - Union, -) - -import numpy as np -import paddle -import paddle.nn.functional as F - -from ...utils import is_note_seq_available -from .pipeline_spectrogram_diffusion import TARGET_FEATURE_LENGTH - -if is_note_seq_available(): - import note_seq -else: - raise ImportError("Please install note-seq via `pip install note-seq`") -INPUT_FEATURE_LENGTH = 2048 -SAMPLE_RATE = 16000 -HOP_SIZE = 320 -FRAME_RATE = int(SAMPLE_RATE // HOP_SIZE) -DEFAULT_STEPS_PER_SECOND = 100 -DEFAULT_MAX_SHIFT_SECONDS = 10 -DEFAULT_NUM_VELOCITY_BINS = 1 -SLAKH_CLASS_PROGRAMS = { - "Acoustic Piano": 0, - "Electric Piano": 4, - "Chromatic Percussion": 8, - "Organ": 16, - "Acoustic Guitar": 24, - "Clean Electric Guitar": 26, - "Distorted Electric Guitar": 29, - "Acoustic Bass": 32, - "Electric Bass": 33, - "Violin": 40, - "Viola": 41, - "Cello": 42, - "Contrabass": 43, - "Orchestral Harp": 46, - "Timpani": 47, - "String Ensemble": 48, - "Synth Strings": 50, - "Choir and Voice": 52, - "Orchestral Hit": 55, - "Trumpet": 56, - "Trombone": 57, - "Tuba": 58, - "French Horn": 60, - "Brass Section": 61, - "Soprano/Alto Sax": 64, - "Tenor Sax": 66, - "Baritone Sax": 67, - "Oboe": 68, - "English Horn": 69, - "Bassoon": 70, - "Clarinet": 71, - "Pipe": 73, - "Synth Lead": 80, - "Synth Pad": 88, -} - - -@dataclasses.dataclass -class NoteRepresentationConfig: - """Configuration note representations.""" - - onsets_only: bool - include_ties: bool - - -@dataclasses.dataclass -class NoteEventData: - pitch: int - velocity: Optional[int] = None - program: Optional[int] = None - is_drum: Optional[bool] = None - instrument: Optional[int] = None - - -@dataclasses.dataclass -class NoteEncodingState: - """Encoding state for note transcription, keeping track of active pitches.""" - - # velocity bin for active pitches and programs - active_pitches: MutableMapping[Tuple[int, int], int] = dataclasses.field(default_factory=dict) - - -@dataclasses.dataclass -class EventRange: - type: str - min_value: int - max_value: int - - -@dataclasses.dataclass -class Event: - type: str - value: int - - -class Tokenizer: - def __init__(self, regular_ids: int): - # The special tokens: 0=PAD, 1=EOS, and 2=UNK - self._num_special_tokens = 3 - self._num_regular_tokens = regular_ids - - def encode(self, token_ids): - encoded = [] - for token_id in token_ids: - if not 0 <= token_id < self._num_regular_tokens: - raise ValueError( - f"token_id {token_id} does not fall within valid range of [0, {self._num_regular_tokens})" - ) - encoded.append(token_id + self._num_special_tokens) - - # Add EOS token - encoded.append(1) - - # Pad to till INPUT_FEATURE_LENGTH - encoded = encoded + [0] * (INPUT_FEATURE_LENGTH - len(encoded)) - - return encoded - - -class Codec: - """Encode and decode events. - - Useful for declaring what certain ranges of a vocabulary should be used for. This is intended to be used from - Python before encoding or after decoding with GenericTokenVocabulary. This class is more lightweight and does not - include things like EOS or UNK token handling. - - To ensure that 'shift' events are always the first block of the vocab and start at 0, that event type is required - and specified separately. - """ - - def __init__(self, max_shift_steps: int, steps_per_second: float, event_ranges: List[EventRange]): - """Define Codec. - - Args: - max_shift_steps: Maximum number of shift steps that can be encoded. 
- steps_per_second: Shift steps will be interpreted as having a duration of - 1 / steps_per_second. - event_ranges: Other supported event types and their ranges. - """ - self.steps_per_second = steps_per_second - self._shift_range = EventRange(type="shift", min_value=0, max_value=max_shift_steps) - self._event_ranges = [self._shift_range] + event_ranges - - # Ensure all event types have unique names. - assert len(self._event_ranges) == len({er.type for er in self._event_ranges}) - - @property - def num_classes(self) -> int: - return sum(er.max_value - er.min_value + 1 for er in self._event_ranges) - - # The next couple methods are simplified special case methods just for shift - # events that are intended to be used from within autograph functions. - - def is_shift_event_index(self, index: int) -> bool: - return self._shift_range.min_value <= index and index <= self._shift_range.max_value - - @property - def max_shift_steps(self) -> int: - return self._shift_range.max_value - - def encode_event(self, event: Event) -> int: - """Encode an event to an index.""" - offset = 0 - for er in self._event_ranges: - if event.type == er.type: - if not er.min_value <= event.value <= er.max_value: - raise ValueError( - f"Event value {event.value} is not within valid range [{er.min_value}, {er.max_value}] for type {event.type}" - ) - return offset + event.value - er.min_value - offset += er.max_value - er.min_value + 1 - raise ValueError(f"Unknown event type: {event.type}") - - def event_type_range(self, event_type: str) -> Tuple[int, int]: - """Return [min_id, max_id] for an event type.""" - offset = 0 - for er in self._event_ranges: - if event_type == er.type: - return offset, offset + (er.max_value - er.min_value) - offset += er.max_value - er.min_value + 1 - raise ValueError(f"Unknown event type: {event_type}") - - def decode_event_index(self, index: int) -> Event: - """Decode an event index to an Event.""" - offset = 0 - for er in self._event_ranges: - if offset <= index <= offset + er.max_value - er.min_value: - return Event(type=er.type, value=er.min_value + index - offset) - offset += er.max_value - er.min_value + 1 - raise ValueError(f"Unknown event index: {index}") - - -@dataclasses.dataclass -class ProgramGranularity: - # both tokens_map_fn and program_map_fn should be idempotent - tokens_map_fn: Callable[[Sequence[int], Codec], Sequence[int]] - program_map_fn: Callable[[int], int] - - -def drop_programs(tokens, codec: Codec): - """Drops program change events from a token sequence.""" - min_program_id, max_program_id = codec.event_type_range("program") - return tokens[(tokens < min_program_id) | (tokens > max_program_id)] - - -def programs_to_midi_classes(tokens, codec): - """Modifies program events to be the first program in the MIDI class.""" - min_program_id, max_program_id = codec.event_type_range("program") - is_program = (tokens >= min_program_id) & (tokens <= max_program_id) - return np.where(is_program, min_program_id + 8 * ((tokens - min_program_id) // 8), tokens) - - -PROGRAM_GRANULARITIES = { - # "flat" granularity; drop program change tokens and set NoteSequence - # programs to zero - "flat": ProgramGranularity(tokens_map_fn=drop_programs, program_map_fn=lambda program: 0), - # map each program to the first program in its MIDI class - "midi_class": ProgramGranularity( - tokens_map_fn=programs_to_midi_classes, program_map_fn=lambda program: 8 * (program // 8) - ), - # leave programs as is - "full": ProgramGranularity(tokens_map_fn=lambda tokens, codec: tokens, program_map_fn=lambda 
program: program), -} - - -def unfold(tensor, dimension, size, step=1): - assert dimension < len(tensor.shape), "dimension must be less than tensor dimensions" - assert tensor.shape[dimension] >= size, "size should not be greater than the dimension of tensor" - - slices = [] - for i in range(0, tensor.shape[dimension] - size + 1, step): - start = [0] * len(tensor.shape) - end = list(tensor.shape) - start[dimension] = i - end[dimension] = i + size - axes = list(range(len(start))) - slice = paddle.slice(tensor, axes, start, end) - slices.append(slice) - - unfolded_tensor = paddle.stack(slices, axis=dimension) - - return unfolded_tensor - - -def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1): - """ - equivalent of tf.signal.frame - """ - signal_length = signal.shape[axis] - if pad_end: - frames_overlap = frame_length - frame_step - rest_samples = np.abs(signal_length - frames_overlap) % np.abs(frame_length - frames_overlap) - pad_size = int(frame_length - rest_samples) - if pad_size != 0: - pad_axis = [0] * signal.ndim - pad_axis[axis] = pad_size - signal = F.pad(x=signal, pad=pad_axis, mode="constant", value=pad_value) - frames = unfold(signal, axis, frame_length, frame_step) - return frames - - -def program_to_slakh_program(program): - # this is done very hackily, probably should use a custom mapping - for slakh_program in sorted(SLAKH_CLASS_PROGRAMS.values(), reverse=True): - if program >= slakh_program: - return slakh_program - - -def audio_to_frames(samples, hop_size: int, frame_rate: int) -> Tuple[Sequence[Sequence[int]], paddle.Tensor]: - """Convert audio samples to non-overlapping frames and frame times.""" - frame_size = hop_size - samples = np.pad(samples, [0, frame_size - len(samples) % frame_size], mode="constant") - - # Split audio into frames. - frames = frame( - paddle.to_tensor(data=samples).unsqueeze(axis=0), frame_length=frame_size, frame_step=frame_size, pad_end=False - ) - num_frames = len(samples) // frame_size - times = np.arange(num_frames) / frame_rate - return frames, times - - -def note_sequence_to_onsets_and_offsets_and_programs( - ns: note_seq.NoteSequence, -) -> Tuple[Sequence[float], Sequence[NoteEventData]]: - """Extract onset & offset times and pitches & programs from a NoteSequence. - - The onset & offset times will not necessarily be in sorted order. - - Args: - ns: NoteSequence from which to extract onsets and offsets. - - Returns: - times: A list of note onset and offset times. values: A list of NoteEventData objects where velocity is zero for - note - offsets. - """ - # Sort by program and pitch and put offsets before onsets as a tiebreaker for - # subsequent stable sort. 
- notes = sorted(ns.notes, key=lambda note: (note.is_drum, note.program, note.pitch)) - times = [note.end_time for note in notes if not note.is_drum] + [note.start_time for note in notes] - values = [ - NoteEventData(pitch=note.pitch, velocity=0, program=note.program, is_drum=False) - for note in notes - if not note.is_drum - ] + [ - NoteEventData(pitch=note.pitch, velocity=note.velocity, program=note.program, is_drum=note.is_drum) - for note in notes - ] - return times, values - - -def num_velocity_bins_from_codec(codec: Codec): - """Get number of velocity bins from event codec.""" - lo, hi = codec.event_type_range("velocity") - return hi - lo - - -# segment an array into segments of length n -def segment(a, n): - return [a[i : i + n] for i in range(0, len(a), n)] - - -def velocity_to_bin(velocity, num_velocity_bins): - if velocity == 0: - return 0 - else: - return math.ceil(num_velocity_bins * velocity / note_seq.MAX_MIDI_VELOCITY) - - -def note_event_data_to_events( - state: Optional[NoteEncodingState], value: NoteEventData, codec: Codec -) -> Sequence[Event]: - """Convert note event data to a sequence of events.""" - if value.velocity is None: - # onsets only, no program or velocity - return [Event("pitch", value.pitch)] - else: - num_velocity_bins = num_velocity_bins_from_codec(codec) - velocity_bin = velocity_to_bin(value.velocity, num_velocity_bins) - if value.program is None: - # onsets + offsets + velocities only, no programs - if state is not None: - state.active_pitches[value.pitch, 0] = velocity_bin - return [Event("velocity", velocity_bin), Event("pitch", value.pitch)] - elif value.is_drum: - # drum events use a separate vocabulary - return [Event("velocity", velocity_bin), Event("drum", value.pitch)] - else: - # program + velocity + pitch - if state is not None: - state.active_pitches[value.pitch, value.program] = velocity_bin - return [Event("program", value.program), Event("velocity", velocity_bin), Event("pitch", value.pitch)] - - -def note_encoding_state_to_events(state: NoteEncodingState) -> Sequence[Event]: - """Output program and pitch events for active notes plus a final tie event.""" - events = [] - for pitch, program in sorted(state.active_pitches.keys(), key=lambda k: k[::-1]): - if state.active_pitches[pitch, program]: - events += [Event("program", program), Event("pitch", pitch)] - events.append(Event("tie", 0)) - return events - - -def encode_and_index_events( - state, event_times, event_values, codec, frame_times, encode_event_fn, encoding_state_to_events_fn=None -): - """Encode a sequence of timed events and index to audio frame times. - - Encodes time shifts as repeated single step shifts for later run length encoding. - - Optionally, also encodes a sequence of "state events", keeping track of the current encoding state at each audio - frame. This can be used e.g. to prepend events representing the current state to a targets segment. - - Args: - state: Initial event encoding state. - event_times: Sequence of event times. - event_values: Sequence of event values. - encode_event_fn: Function that transforms event value into a sequence of one - or more Event objects. - codec: An Codec object that maps Event objects to indices. - frame_times: Time for every audio frame. - encoding_state_to_events_fn: Function that transforms encoding state into a - sequence of one or more Event objects. - - Returns: - events: Encoded events and shifts. event_start_indices: Corresponding start event index for every audio frame. 
- Note: one event can correspond to multiple audio indices due to sampling rate differences. This makes - splitting sequences tricky because the same event can appear at the end of one sequence and the beginning of - another. - event_end_indices: Corresponding end event index for every audio frame. Used - to ensure when slicing that one chunk ends where the next begins. Should always be true that - event_end_indices[i] = event_start_indices[i + 1]. - state_events: Encoded "state" events representing the encoding state before - each event. - state_event_indices: Corresponding state event index for every audio frame. - """ - indices = np.argsort(event_times, kind="stable") - event_steps = [round(event_times[i] * codec.steps_per_second) for i in indices] - event_values = [event_values[i] for i in indices] - events = [] - state_events = [] - event_start_indices = [] - state_event_indices = [] - cur_step = 0 - cur_event_idx = 0 - cur_state_event_idx = 0 - - def fill_event_start_indices_to_cur_step(): - while ( - len(event_start_indices) < len(frame_times) - and frame_times[len(event_start_indices)] < cur_step / codec.steps_per_second - ): - event_start_indices.append(cur_event_idx) - state_event_indices.append(cur_state_event_idx) - - for event_step, event_value in zip(event_steps, event_values): - while event_step > cur_step: - events.append(codec.encode_event(Event(type="shift", value=1))) - cur_step += 1 - fill_event_start_indices_to_cur_step() - cur_event_idx = len(events) - cur_state_event_idx = len(state_events) - if encoding_state_to_events_fn: - # Dump state to state events *before* processing the next event, because - # we want to capture the state prior to the occurrence of the event. - for e in encoding_state_to_events_fn(state): - state_events.append(codec.encode_event(e)) - for e in encode_event_fn(state, event_value, codec): - events.append(codec.encode_event(e)) - - # After the last event, continue filling out the event_start_indices array. - # The inequality is not strict because if our current step lines up exactly - # with (the start of) an audio frame, we need to add an additional shift event - # to "cover" that frame. - while cur_step / codec.steps_per_second <= frame_times[-1]: - events.append(codec.encode_event(Event(type="shift", value=1))) - cur_step += 1 - fill_event_start_indices_to_cur_step() - cur_event_idx = len(events) - - # Now fill in event_end_indices. We need this extra array to make sure that - # when we slice events, each slice ends exactly where the subsequent slice - # begins. 
- event_end_indices = event_start_indices[1:] + [len(events)] - - events = np.array(events).astype(np.int32) - state_events = np.array(state_events).astype(np.int32) - event_start_indices = segment(np.array(event_start_indices).astype(np.int32), TARGET_FEATURE_LENGTH) - event_end_indices = segment(np.array(event_end_indices).astype(np.int32), TARGET_FEATURE_LENGTH) - state_event_indices = segment(np.array(state_event_indices).astype(np.int32), TARGET_FEATURE_LENGTH) - outputs = [] - for start_indices, end_indices, event_indices in zip(event_start_indices, event_end_indices, state_event_indices): - outputs.append( - { - "inputs": events, - "event_start_indices": start_indices, - "event_end_indices": end_indices, - "state_events": state_events, - "state_event_indices": event_indices, - } - ) - return outputs - - -def extract_sequence_with_indices(features, state_events_end_token=None, feature_key="inputs"): - """Extract target sequence corresponding to audio token segment.""" - features = features.copy() - start_idx = features["event_start_indices"][0] - end_idx = features["event_end_indices"][-1] - features[feature_key] = features[feature_key][start_idx:end_idx] - if state_events_end_token is not None: - # Extract the state events corresponding to the audio start token, and - # prepend them to the targets array. - state_event_start_idx = features["state_event_indices"][0] - state_event_end_idx = state_event_start_idx + 1 - while features["state_events"][state_event_end_idx - 1] != state_events_end_token: - state_event_end_idx += 1 - features[feature_key] = np.concatenate( - [features["state_events"][state_event_start_idx:state_event_end_idx], features[feature_key]], axis=0 - ) - return features - - -def map_midi_programs( - feature, codec: Codec, granularity_type: str = "full", feature_key: str = "inputs" -) -> Mapping[str, Any]: - """Apply MIDI program map to token sequences.""" - granularity = PROGRAM_GRANULARITIES[granularity_type] - feature[feature_key] = granularity.tokens_map_fn(feature[feature_key], codec) - return feature - - -def run_length_encode_shifts_fn( - features, codec: Codec, feature_key: str = "inputs", state_change_event_types: Sequence[str] = () -) -> Callable[[Mapping[str, Any]], Mapping[str, Any]]: - """Return a function that run-length encodes shifts for a given codec. - - Args: - codec: The Codec to use for shift events. - feature_key: The feature key for which to run-length encode shifts. - state_change_event_types: A list of event types that represent state - changes; tokens corresponding to these event types will be interpreted as state changes and redundant ones - will be removed. - - Returns: - A preprocessing function that run-length encodes single-step shifts. - """ - state_change_event_ranges = [codec.event_type_range(event_type) for event_type in state_change_event_types] - - def run_length_encode_shifts(features: MutableMapping[str, Any]) -> Mapping[str, Any]: - """Combine leading/interior shifts, trim trailing shifts. - - Args: - features: Dict of features to process. - - Returns: - A dict of features. - """ - events = features[feature_key] - shift_steps = 0 - total_shift_steps = 0 - output = np.array([], dtype=np.int32) - current_state = np.zeros(len(state_change_event_ranges), dtype=np.int32) - for event in events: - if codec.is_shift_event_index(event): - shift_steps += 1 - total_shift_steps += 1 - else: - # If this event is a state change and has the same value as the current - # state, we can skip it entirely. 
- is_redundant = False - for i, (min_index, max_index) in enumerate(state_change_event_ranges): - if min_index <= event and event <= max_index: - if current_state[i] == event: - is_redundant = True - current_state[i] = event - if is_redundant: - continue - - # Once we've reached a non-shift event, RLE all previous shift events - # before outputting the non-shift event. - if shift_steps > 0: - shift_steps = total_shift_steps - while shift_steps > 0: - output_steps = np.minimum(codec.max_shift_steps, shift_steps) - output = np.concatenate([output, [output_steps]], axis=0) - shift_steps -= output_steps - output = np.concatenate([output, [event]], axis=0) - features[feature_key] = output - return features - - return run_length_encode_shifts(features) - - -def note_representation_processor_chain(features, codec: Codec, note_representation_config: NoteRepresentationConfig): - tie_token = codec.encode_event(Event("tie", 0)) - state_events_end_token = tie_token if note_representation_config.include_ties else None - features = extract_sequence_with_indices( - features, state_events_end_token=state_events_end_token, feature_key="inputs" - ) - features = map_midi_programs(features, codec) - features = run_length_encode_shifts_fn(features, codec, state_change_event_types=["velocity", "program"]) - return features - - -class MidiProcessor: - def __init__(self): - self.codec = Codec( - max_shift_steps=DEFAULT_MAX_SHIFT_SECONDS * DEFAULT_STEPS_PER_SECOND, - steps_per_second=DEFAULT_STEPS_PER_SECOND, - event_ranges=[ - EventRange("pitch", note_seq.MIN_MIDI_PITCH, note_seq.MAX_MIDI_PITCH), - EventRange("velocity", 0, DEFAULT_NUM_VELOCITY_BINS), - EventRange("tie", 0, 0), - EventRange("program", note_seq.MIN_MIDI_PROGRAM, note_seq.MAX_MIDI_PROGRAM), - EventRange("drum", note_seq.MIN_MIDI_PITCH, note_seq.MAX_MIDI_PITCH), - ], - ) - self.tokenizer = Tokenizer(self.codec.num_classes) - self.note_representation_config = NoteRepresentationConfig(onsets_only=False, include_ties=True) - - def __call__(self, midi: Union[bytes, os.PathLike, str]): - if not isinstance(midi, bytes): - with open(midi, "rb") as f: - midi = f.read() - ns = note_seq.midi_to_note_sequence(midi) - ns_sus = note_seq.apply_sustain_control_changes(ns) - for note in ns_sus.notes: - if not note.is_drum: - note.program = program_to_slakh_program(note.program) - samples = np.zeros(int(ns_sus.total_time * SAMPLE_RATE)) - _, frame_times = audio_to_frames(samples, HOP_SIZE, FRAME_RATE) - times, values = note_sequence_to_onsets_and_offsets_and_programs(ns_sus) - events = encode_and_index_events( - state=NoteEncodingState(), - event_times=times, - event_values=values, - frame_times=frame_times, - codec=self.codec, - encode_event_fn=note_event_data_to_events, - encoding_state_to_events_fn=note_encoding_state_to_events, - ) - events = [ - note_representation_processor_chain(event, self.codec, self.note_representation_config) for event in events - ] - input_tokens = [self.tokenizer.encode(event["inputs"]) for event in events] - return input_tokens diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/notes_encoder.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/notes_encoder.py deleted file mode 100644 index e4890fed3c66..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/notes_encoder.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn - -from paddlenlp.transformers.t5.configuration import T5Config -from paddlenlp.transformers.t5.modeling import T5Block, T5LayerNorm - -from ...configuration_utils import ConfigMixin, ModuleUtilsMixin, register_to_config -from ...models import ModelMixin - - -class SpectrogramNotesEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): - @register_to_config - def __init__( - self, - max_length: int, - vocab_size: int, - d_model: int, - dropout_rate: float, - num_layers: int, - num_heads: int, - d_kv: int, - d_ff: int, - feed_forward_proj: str, - is_decoder: bool = False, - ): - super().__init__() - self.token_embedder = nn.Embedding(vocab_size, d_model) - self.position_encoding = nn.Embedding(max_length, d_model) - self.position_encoding.weight.stop_gradient = not False - self.dropout_pre = nn.Dropout(p=dropout_rate) - t5config = T5Config( - vocab_size=vocab_size, - d_model=d_model, - num_heads=num_heads, - d_kv=d_kv, - d_ff=d_ff, - dropout_rate=dropout_rate, - feed_forward_proj=feed_forward_proj, - is_decoder=is_decoder, - is_encoder_decoder=False, - ) - self.encoders = nn.LayerList() - for lyr_num in range(num_layers): - lyr = T5Block(t5config) - self.encoders.append(lyr) - self.layer_norm = T5LayerNorm(d_model) - self.dropout_post = nn.Dropout(p=dropout_rate) - - def forward(self, encoder_input_tokens, encoder_inputs_mask): - x = self.token_embedder(encoder_input_tokens) - seq_length = encoder_input_tokens.shape[1] - inputs_positions = paddle.arange(end=seq_length) - x += self.position_encoding(inputs_positions) - x = self.dropout_pre(x) - - # inverted the attention mask - input_shape = encoder_input_tokens.shape - extended_attention_mask = self.get_extended_attention_mask(encoder_inputs_mask, input_shape) - for lyr in self.encoders: - x = lyr(x, extended_attention_mask)[0] - x = self.layer_norm(x) - return self.dropout_post(x), encoder_inputs_mask diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py deleted file mode 100644 index 0fa69a0aa4af..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ /dev/null @@ -1,177 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
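The SpectrogramNotesEncoder above consumes token sequences produced by the Tokenizer in the deleted midi_utils.py, where 0=PAD, 1=EOS, 2=UNK and every regular event id is shifted up by 3 before the sequence is padded out to INPUT_FEATURE_LENGTH. A small standalone sketch of that encoding; the constants are copied from midi_utils.py, while the function name and example ids are illustrative:

INPUT_FEATURE_LENGTH = 2048  # value from the deleted midi_utils.py
NUM_SPECIAL_TOKENS = 3       # 0=PAD, 1=EOS, 2=UNK

def encode_event_ids(event_ids, num_regular_tokens):
    # Shift regular ids past the special tokens, append EOS, then pad with PAD.
    encoded = []
    for event_id in event_ids:
        if not 0 <= event_id < num_regular_tokens:
            raise ValueError(f"event id {event_id} out of range [0, {num_regular_tokens})")
        encoded.append(event_id + NUM_SPECIAL_TOKENS)
    encoded.append(1)                                        # EOS token
    encoded += [0] * (INPUT_FEATURE_LENGTH - len(encoded))   # PAD to fixed length
    return encoded

tokens = encode_event_ids([0, 10, 255], num_regular_tokens=1000)
print(tokens[:5], len(tokens))   # [3, 13, 258, 1, 0] 2048

Because PAD is 0, a boolean padding mask can be recovered later simply as tokens > 0, which is how the pipeline below builds its notes-encoder mask.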
- -import math -from typing import Any, Callable, List, Optional, Tuple, Union - -import numpy as np -import paddle - -from ...models import T5FilmDecoder -from ...schedulers import DDPMScheduler -from ...utils import logging, randn_tensor -from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline -from .continous_encoder import SpectrogramContEncoder -from .notes_encoder import SpectrogramNotesEncoder - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name -TARGET_FEATURE_LENGTH = 256 - - -class SpectrogramDiffusionPipeline(DiffusionPipeline): - _optional_components = ["melgan"] - - def __init__( - self, - notes_encoder: SpectrogramNotesEncoder, - continuous_encoder: SpectrogramContEncoder, - decoder: T5FilmDecoder, - scheduler: DDPMScheduler, - melgan: (Any), - ) -> None: - super().__init__() - - # From MELGAN - self.min_value = math.log(1e-05) # Matches MelGAN training. - self.max_value = 4.0 # Largest value for most examples - self.n_dims = 128 - self.register_modules( - notes_encoder=notes_encoder, - continuous_encoder=continuous_encoder, - decoder=decoder, - scheduler=scheduler, - melgan=melgan, - ) - - def scale_features(self, features, output_range=(-1.0, 1.0), clip=False): - """Linearly scale features to network outputs range.""" - min_out, max_out = output_range - if clip: - features = paddle.clip(x=features, min=self.min_value, max=self.max_value) - # Scale to [0, 1]. - zero_one = (features - self.min_value) / (self.max_value - self.min_value) - # Scale to [min_out, max_out]. - return zero_one * (max_out - min_out) + min_out - - def scale_to_features(self, outputs, input_range=(-1.0, 1.0), clip=False): - """Invert by linearly scaling network outputs to features range.""" - min_out, max_out = input_range - outputs = paddle.clip(x=outputs, min=min_out, max=max_out) if clip else outputs - # Scale to [0, 1]. - zero_one = (outputs - min_out) / (max_out - min_out) - # Scale to [self.min_value, self.max_value]. 
- return zero_one * (self.max_value - self.min_value) + self.min_value - - def encode(self, input_tokens, continuous_inputs, continuous_mask): - tokens_mask = input_tokens > 0 - tokens_encoded, tokens_mask = self.notes_encoder( - encoder_input_tokens=input_tokens, encoder_inputs_mask=tokens_mask - ) - continuous_encoded, continuous_mask = self.continuous_encoder( - encoder_inputs=continuous_inputs.cast(self.continuous_encoder.dtype), encoder_inputs_mask=continuous_mask - ) - return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)] - - def decode(self, encodings_and_masks, input_tokens, noise_time): - timesteps = noise_time - if not paddle.is_tensor(x=timesteps): - timesteps = paddle.to_tensor(data=[timesteps], dtype="int64", place=input_tokens.place) - elif paddle.is_tensor(x=timesteps) and len(timesteps.shape) == 0: - if isinstance(input_tokens.place, paddle.dtype): - dtype = input_tokens.place - elif isinstance(input_tokens.place, str) and input_tokens.place not in ["cpu", "cuda", "ipu", "xpu"]: - dtype = input_tokens.place - elif isinstance(input_tokens.place, paddle.Tensor): - dtype = input_tokens.place.dtype - else: - dtype = timesteps[None].dtype - timesteps = timesteps[None].cast(dtype) - # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps * paddle.ones(shape=input_tokens.shape[0], dtype=timesteps.dtype) - logits = self.decoder( - encodings_and_masks=encodings_and_masks, decoder_input_tokens=input_tokens, decoder_noise_time=timesteps - ) - return logits - - @paddle.no_grad() - def __call__( - self, - input_tokens: List[List[int]], - generator: Optional[paddle.Generator] = None, - num_inference_steps: int = 100, - return_dict: bool = True, - output_type: str = "numpy", - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: int = 1, - ) -> Union[AudioPipelineOutput, Tuple]: - if ( - callback_steps is None - or callback_steps is not None - and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}." - ) - pred_mel = np.zeros([1, TARGET_FEATURE_LENGTH, self.n_dims], dtype=np.float32) - full_pred_mel = np.zeros([1, 0, self.n_dims], np.float32) - ones = paddle.ones(shape=(1, TARGET_FEATURE_LENGTH), dtype=bool) - for i, encoder_input_tokens in enumerate(input_tokens): - if i == 0: - encoder_continuous_inputs = paddle.to_tensor(data=pred_mel[:1].copy()).cast(self.decoder.dtype) - # The first chunk has no previous context. - encoder_continuous_mask = paddle.zeros(shape=(1, TARGET_FEATURE_LENGTH), dtype=bool) - else: - # The full song pipeline does not feed in a context feature, so the mask - # will be all 0s after the feature converter. Because we know we're - # feeding in a full context chunk from the previous prediction, set it - # to all 1s. 
- encoder_continuous_mask = ones - encoder_continuous_inputs = self.scale_features( - encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True - ) - encodings_and_masks = self.encode( - input_tokens=paddle.to_tensor(data=[encoder_input_tokens], dtype="int32"), - continuous_inputs=encoder_continuous_inputs, - continuous_mask=encoder_continuous_mask, - ) - # Sample encoder_continuous_inputs shaped gaussian noise to begin loop - x = randn_tensor(shape=encoder_continuous_inputs.shape, generator=generator, dtype=self.decoder.dtype) - # set step values - self.scheduler.set_timesteps(num_inference_steps) - # Denoising diffusion loop - for j, t in enumerate(self.progress_bar(self.scheduler.timesteps)): - output = self.decode( - encodings_and_masks=encodings_and_masks, - input_tokens=x, - noise_time=t / self.scheduler.config.num_train_timesteps, - ) - - # Compute previous output: x_t -> x_t-1 - x = self.scheduler.step(output, t, x, generator=generator).prev_sample - mel = self.scale_to_features(x, input_range=[-1.0, 1.0]) - encoder_continuous_inputs = mel[:1] - pred_mel = mel.cpu().astype(dtype="float32").numpy() - full_pred_mel = np.concatenate([full_pred_mel, pred_mel[:1]], axis=1) - - # call the callback, if provided - if callback is not None and i % callback_steps == 0: - callback(i, full_pred_mel) - logger.info("Generated segment", i) - if output_type == "numpy": - output = self.melgan(input_features=full_pred_mel.astype(np.float32))[0] - else: - output = full_pred_mel - if not return_dict: - return (output,) - return AudioPipelineOutput(audios=output) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py deleted file mode 100644 index bf0142ee9322..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import List, Optional, Union - -import numpy as np -import PIL.Image - -from ...utils import ( - BaseOutput, - OptionalDependencyNotAvailable, - is_fastdeploy_available, - is_k_diffusion_available, - is_k_diffusion_version, - is_paddle_available, - is_paddlenlp_available, -) - - -@dataclass -class StableDiffusionPipelineOutput(BaseOutput): - """ - Output class for Stable Diffusion pipelines. - - Args: - images (`List[PIL.Image.Image]` or `np.ndarray`) - List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, - num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. - nsfw_content_detected (`List[bool]`) - List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, or `None` if safety checking could not be performed. 
- """ - - images: Union[List[PIL.Image.Image], np.ndarray] - nsfw_content_detected: Optional[List[bool]] - - -try: - if not (is_paddle_available() and is_paddlenlp_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ...utils.dummy_paddle_and_paddlenlp_objects import * # noqa F403 -else: - # new added - from .hf_clip_model import ( - HFCLIPModel, - HFCLIPTextModel, - HFCLIPTextModelWithProjection, - HFCLIPVisionModel, - HFCLIPVisionModelWithProjection, - ) - from .pipeline_cycle_diffusion import CycleDiffusionPipeline - from .pipeline_stable_diffusion import StableDiffusionPipeline - from .pipeline_stable_diffusion_adapter import StableDiffusionAdapterPipeline - from .pipeline_stable_diffusion_all_in_one import StableDiffusionPipelineAllinOne - from .pipeline_stable_diffusion_attend_and_excite import ( - StableDiffusionAttendAndExcitePipeline, - ) - from .pipeline_stable_diffusion_controlnet import StableDiffusionControlNetPipeline - from .pipeline_stable_diffusion_depth2img import StableDiffusionDepth2ImgPipeline - from .pipeline_stable_diffusion_image_variation import ( - StableDiffusionImageVariationPipeline, - ) - from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline - from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline - from .pipeline_stable_diffusion_inpaint_legacy import ( - StableDiffusionInpaintPipelineLegacy, - ) - from .pipeline_stable_diffusion_instruct_pix2pix import ( - StableDiffusionInstructPix2PixPipeline, - ) - from .pipeline_stable_diffusion_k_diffusion import StableDiffusionKDiffusionPipeline - from .pipeline_stable_diffusion_latent_upscale import ( - StableDiffusionLatentUpscalePipeline, - ) - from .pipeline_stable_diffusion_mega import StableDiffusionMegaPipeline - from .pipeline_stable_diffusion_model_editing import ( - StableDiffusionModelEditingPipeline, - ) - from .pipeline_stable_diffusion_panorama import StableDiffusionPanoramaPipeline - from .pipeline_stable_diffusion_pix2pix_zero import ( - StableDiffusionPix2PixZeroPipeline, - ) - from .pipeline_stable_diffusion_sag import StableDiffusionSAGPipeline - from .pipeline_stable_diffusion_upscale import StableDiffusionUpscalePipeline - from .pipeline_stable_unclip import StableUnCLIPPipeline - from .pipeline_stable_unclip_img2img import StableUnCLIPImg2ImgPipeline - from .safety_checker import StableDiffusionSafetyChecker - from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer - - -try: - if not (is_paddle_available() and is_fastdeploy_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ...utils.dummy_fastdeploy_objects import * # noqa F403 -else: - from .pipeline_fastdeploy_cycle_diffusion import FastDeployCycleDiffusionPipeline - from .pipeline_fastdeploy_stable_diffusion import FastDeployStableDiffusionPipeline - from .pipeline_fastdeploy_stable_diffusion_controlnet import ( - FastDeployStableDiffusionControlNetPipeline, - ) - from .pipeline_fastdeploy_stable_diffusion_image_variation import ( - FastDeployStableDiffusionImageVariationPipeline, - ) - from .pipeline_fastdeploy_stable_diffusion_img2img import ( - FastDeployStableDiffusionImg2ImgPipeline, - ) - from .pipeline_fastdeploy_stable_diffusion_inpaint import ( - FastDeployStableDiffusionInpaintPipeline, - ) - from .pipeline_fastdeploy_stable_diffusion_inpaint_legacy import ( - FastDeployStableDiffusionInpaintPipelineLegacy, - ) - from .pipeline_fastdeploy_stable_diffusion_mega import ( - 
FastDeployStableDiffusionMegaPipeline, - ) - from .pipeline_fastdeploy_stable_diffusion_upscale import ( - FastDeployStableDiffusionUpscalePipeline, - ) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt.py deleted file mode 100644 index c325b0840cfd..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ /dev/null @@ -1,1421 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Conversion script for the Stable Diffusion checkpoints.""" - -import re -from io import BytesIO -from typing import Optional - -import numpy as np -import requests - -from paddlenlp.transformers import ( - BertTokenizer, - CLIPFeatureExtractor, - CLIPImageProcessor, - CLIPTextModel, - CLIPTextModelWithProjection, - CLIPTokenizer, - CLIPVisionConfig, - CLIPVisionModelWithProjection, -) - -from ...models import ( - AutoencoderKL, - ControlNetModel, - PriorTransformer, - UNet2DConditionModel, -) -from ...schedulers import ( - DDIMScheduler, - DDPMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - HeunDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - UnCLIPScheduler, -) -from ...utils import is_omegaconf_available, logging -from ...utils.import_utils import BACKENDS_MAPPING -from ...utils.load_utils import smart_load -from ..latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel -from ..paint_by_example import PaintByExampleImageEncoder -from ..pipeline_utils import DiffusionPipeline -from .safety_checker import StableDiffusionSafetyChecker -from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -def shave_segments(path, n_shave_prefix_segments=1): - """ - Removes segments. Positive values shave the first segments, negative shave the last segments. 
- """ - if n_shave_prefix_segments >= 0: - return ".".join(path.split(".")[n_shave_prefix_segments:]) - else: - return ".".join(path.split(".")[:n_shave_prefix_segments]) - - -def renew_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item.replace("in_layers.0", "norm1") - new_item = new_item.replace("in_layers.2", "conv1") - - new_item = new_item.replace("out_layers.0", "norm2") - new_item = new_item.replace("out_layers.3", "conv2") - - new_item = new_item.replace("emb_layers.1", "time_emb_proj") - new_item = new_item.replace("skip_connection", "conv_shortcut") - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace("norm.weight", "group_norm.weight") - new_item = new_item.replace("norm.bias", "group_norm.bias") - - new_item = new_item.replace("q.weight", "query.weight") - new_item = new_item.replace("q.bias", "query.bias") - - new_item = new_item.replace("k.weight", "key.weight") - new_item = new_item.replace("k.bias", "key.bias") - - new_item = new_item.replace("v.weight", "value.weight") - new_item = new_item.replace("v.bias", "value.bias") - - new_item = new_item.replace("proj_out.weight", "proj_attn.weight") - new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def assign_to_checkpoint( - paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None -): - """ - This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits - attention layers, and takes into account additional replacements that may arise. - - Assigns the weights to the new checkpoint. - """ - assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." - - # Splits the attention layers into three variables. 
- if attention_paths_to_split is not None: - for path, path_map in attention_paths_to_split.items(): - old_tensor = old_checkpoint[path] - channels = old_tensor.shape[0] // 3 - - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) - - num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - - old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) - query, key, value = np.split(old_tensor, 3, axis=1) - - checkpoint[path_map["query"]] = query.reshape(target_shape) - checkpoint[path_map["key"]] = key.reshape(target_shape) - checkpoint[path_map["value"]] = value.reshape(target_shape) - - for path in paths: - new_path = path["new"] - - # These have already been assigned - if attention_paths_to_split is not None and new_path in attention_paths_to_split: - continue - - # Global renaming happens here - new_path = new_path.replace("middle_block.0", "mid_block.resnets.0") - new_path = new_path.replace("middle_block.1", "mid_block.attentions.0") - new_path = new_path.replace("middle_block.2", "mid_block.resnets.1") - - if additional_replacements is not None: - for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], replacement["new"]) - - # proj_attn.weight has to be converted from conv 1D to linear - if "proj_attn.weight" in new_path: - checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0] - else: - checkpoint[new_path] = old_checkpoint[path["old"]] - - -def conv_attn_to_linear(checkpoint): - keys = list(checkpoint.keys()) - attn_keys = ["query.weight", "key.weight", "value.weight"] - for key in keys: - if ".".join(key.split(".")[-2:]) in attn_keys: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0, 0] - elif "proj_attn.weight" in key: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0] - - -def create_unet_diffusers_config(original_config, image_size: int, controlnet=False): - """ - Creates a config for the diffusers based on the config of the LDM model. 
- """ - if controlnet: - unet_params = original_config.model.params.control_stage_config.params - else: - unet_params = original_config.model.params.unet_config.params - - vae_params = original_config.model.params.first_stage_config.params.ddconfig - - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] - - down_block_types = [] - resolution = 1 - for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" - down_block_types.append(block_type) - if i != len(block_out_channels) - 1: - resolution *= 2 - - up_block_types = [] - for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" - up_block_types.append(block_type) - resolution //= 2 - - vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) - - head_dim = unet_params.num_heads if "num_heads" in unet_params else None - use_linear_projection = ( - unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False - ) - if use_linear_projection: - # stable diffusion 2-base-512 and 2-768 - if head_dim is None: - head_dim = [5, 10, 20, 20] - - class_embed_type = None - projection_class_embeddings_input_dim = None - - if "num_classes" in unet_params: - if unet_params.num_classes == "sequential": - class_embed_type = "projection" - assert "adm_in_channels" in unet_params - projection_class_embeddings_input_dim = unet_params.adm_in_channels - else: - raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params.num_classes}") - - config = { - "sample_size": image_size // vae_scale_factor, - "in_channels": unet_params.in_channels, - "down_block_types": tuple(down_block_types), - "block_out_channels": tuple(block_out_channels), - "layers_per_block": unet_params.num_res_blocks, - "cross_attention_dim": unet_params.context_dim, - "attention_head_dim": head_dim, - "use_linear_projection": use_linear_projection, - "class_embed_type": class_embed_type, - "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, - } - - if not controlnet: - config["out_channels"] = unet_params.out_channels - config["up_block_types"] = tuple(up_block_types) - - return config - - -def create_vae_diffusers_config(original_config, image_size: int): - """ - Creates a config for the diffusers based on the config of the LDM model. 
- """ - vae_params = original_config.model.params.first_stage_config.params.ddconfig - _ = original_config.model.params.first_stage_config.params.embed_dim - - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] - down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) - up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) - - config = { - "sample_size": image_size, - "in_channels": vae_params.in_channels, - "out_channels": vae_params.out_ch, - "down_block_types": tuple(down_block_types), - "up_block_types": tuple(up_block_types), - "block_out_channels": tuple(block_out_channels), - "latent_channels": vae_params.z_channels, - "layers_per_block": vae_params.num_res_blocks, - } - return config - - -def get_default(params, key, default): - if key in params: - return params[key] - else: - return default - - -def create_diffusers_schedular(original_config): - schedular = DDIMScheduler( - num_train_timesteps=original_config.model.params.timesteps, - beta_start=original_config.model.params.linear_start, - beta_end=original_config.model.params.linear_end, - beta_schedule="scaled_linear", - ) - return schedular - - -def create_ldm_bert_config(original_config): - bert_params = dict(original_config.model.params.cond_stage_config.params) - config = dict( - vocab_size=get_default(bert_params, "vocab_size", 30522), - max_position_embeddings=get_default(bert_params, "max_seq_len", 77), - encoder_layers=get_default(bert_params, "n_layer", 32), - encoder_ffn_dim=get_default(bert_params, "n_embed", 1280) * 4, - encoder_attention_heads=8, - head_dim=64, - activation_function="gelu", - d_model=get_default(bert_params, "n_embed", 1280), - dropout=0.0, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - pad_token_id=0, - ) - return LDMBertConfig(**config) - - -def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False, controlnet=False, no_unet_key=False): - """ - Takes a state dict and a config, and returns a converted checkpoint. - """ - - # extract state_dict for UNet - unet_state_dict = {} - keys = list(checkpoint.keys()) - - if no_unet_key: - unet_key = "" - else: - if controlnet: - unet_key = "control_model." - else: - unet_key = "model.diffusion_model." - - # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA - if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema: - print(f"Checkpoint {path} has both EMA and non-EMA weights.") - print( - "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA" - " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag." - ) - for key in keys: - if key.startswith("model.diffusion_model"): - flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) - else: - if sum(k.startswith("model_ema") for k in keys) > 100: - print( - "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" - " weights (usually better for inference), please make sure to add the `--extract_ema` flag." 
- ) - - for key in keys: - if key.startswith(unet_key): - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) - - new_checkpoint = {} - - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] - - if config["class_embed_type"] is None: - # No parameters to port - ... - elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection": - new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"] - new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"] - new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"] - new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"] - else: - raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}") - - new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] - new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] - - if not controlnet: - new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] - new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] - new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] - new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] - - # Retrieves the keys for the input blocks only - num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) - input_blocks = { - layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] - for layer_id in range(num_input_blocks) - } - - # Retrieves the keys for the middle blocks only - num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) - middle_blocks = { - layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] - for layer_id in range(num_middle_blocks) - } - - # Retrieves the keys for the output blocks only - num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) - output_blocks = { - layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] - for layer_id in range(num_output_blocks) - } - - for i in range(1, num_input_blocks): - block_id = (i - 1) // (config["layers_per_block"] + 1) - layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) - - resnets = [ - key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key - ] - attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] - - if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.weight" - ) - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.bias" - ) - - paths = renew_resnet_paths(resnets) - meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - if len(attentions): - paths = 
renew_attention_paths(attentions) - meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - resnet_0 = middle_blocks[0] - attentions = middle_blocks[1] - resnet_1 = middle_blocks[2] - - resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) - - resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) - - attentions_paths = renew_attention_paths(attentions) - meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} - assign_to_checkpoint( - attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - for i in range(num_output_blocks): - block_id = i // (config["layers_per_block"] + 1) - layer_in_block_id = i % (config["layers_per_block"] + 1) - output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] - output_block_list = {} - - for layer in output_block_layers: - layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) - if layer_id in output_block_list: - output_block_list[layer_id].append(layer_name) - else: - output_block_list[layer_id] = [layer_name] - - if len(output_block_list) > 1: - resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] - attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] - - resnet_0_paths = renew_resnet_paths(resnets) - paths = renew_resnet_paths(resnets) - - meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - output_block_list = {k: sorted(v) for k, v in output_block_list.items()} - - if ["conv.bias", "conv.weight"] in output_block_list.values(): - index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) - new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight" - ] - new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias" - ] - - # Clear attentions as they have been attributed above. 
- if len(attentions) == 2: - attentions = [] - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = { - "old": f"output_blocks.{i}.1", - "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", - } - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - else: - resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) - for path in resnet_0_paths: - old_path = ".".join(["output_blocks", str(i), path["old"]]) - new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) - - new_checkpoint[new_path] = unet_state_dict[old_path] - - if controlnet: - # conditioning embedding - - orig_index = 0 - - new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight" - ) - new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias" - ) - - orig_index += 2 - - diffusers_index = 0 - - while diffusers_index < 6: - new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight" - ) - new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias" - ) - diffusers_index += 1 - orig_index += 2 - - new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight" - ) - new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias" - ) - - # down blocks - for i in range(num_input_blocks): - new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight") - new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias") - - # mid block - new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight") - new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias") - - return new_checkpoint - - -def convert_ldm_vae_checkpoint(checkpoint, config): - # extract state dict for VAE - vae_state_dict = {} - vae_key = "first_stage_model." 
- keys = list(checkpoint.keys()) - for key in keys: - if key.startswith(vae_key): - vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) - - new_checkpoint = {} - - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] - - new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] - new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] - - # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) - down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) - } - - # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) - up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) - } - - for i in range(num_down_blocks): - resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] - - if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight" - ) - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias" - ) - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} - assign_to_checkpoint(paths, 
new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - - for i in range(num_up_blocks): - block_id = num_up_blocks - 1 - i - resnets = [ - key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key - ] - - if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight" - ] - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias" - ] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - return new_checkpoint - - -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint): - import paddle.nn as nn - - need_transpose = [] - for k, v in vae_or_unet.named_sublayers(include_self=True): - if isinstance(v, nn.Linear): - need_transpose.append(k + ".weight") - new_vae_or_unet = {} - for k, v in diffusers_vae_unet_checkpoint.items(): - if k not in need_transpose: - new_vae_or_unet[k] = v - else: - new_vae_or_unet[k] = v.T - return new_vae_or_unet - - -def convert_ldm_bert_checkpoint(checkpoint, config): - # extract state dict for bert - bert_state_dict = {} - bert_key = "cond_stage_model." 
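- # The original LDM checkpoint stores the text conditioner under the "cond_stage_model." prefix;
- # it is stripped below so the remaining names can be remapped onto LDMBertModel. The transformer
- # alternates (norm, attention) and (norm, feed-forward) sublayers, hence layer i reads from
- # attn_layers.layers.{2*i} and attn_layers.layers.{2*i+1}, e.g.
- # "transformer.attn_layers.layers.0.1.to_q.weight" -> "encoder.layers.0.self_attn.q_proj.weight" (transposed).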
- keys = list(checkpoint.keys()) - for key in keys: - if key.startswith(bert_key): - bert_state_dict[key.replace(bert_key, "")] = checkpoint.get(key) - - new_checkpoint = {} - new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict["transformer.token_emb.weight"] - new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict["transformer.pos_emb.emb.weight"] - for i in range(config.encoder_layers): - double_i = 2 * i - double_i_plus1 = 2 * i + 1 - # convert norm - new_checkpoint[f"encoder.layers.{i}.norm1.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.0.weight" - ] - new_checkpoint[f"encoder.layers.{i}.norm1.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.0.bias" - ] - - new_checkpoint[f"encoder.layers.{i}.self_attn.q_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_q.weight" - ].T - new_checkpoint[f"encoder.layers.{i}.self_attn.k_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_k.weight" - ].T - new_checkpoint[f"encoder.layers.{i}.self_attn.v_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_v.weight" - ].T - new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_out.weight" - ].T - new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_out.bias" - ] - - new_checkpoint[f"encoder.layers.{i}.norm2.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.0.weight" - ] - new_checkpoint[f"encoder.layers.{i}.norm2.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.0.bias" - ] - new_checkpoint[f"encoder.layers.{i}.linear1.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight" - ].T - new_checkpoint[f"encoder.layers.{i}.linear1.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias" - ] - new_checkpoint[f"encoder.layers.{i}.linear2.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight" - ].T - new_checkpoint[f"encoder.layers.{i}.linear2.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias" - ].T - - new_checkpoint["final_layer_norm.weight"] = bert_state_dict["transformer.norm.weight"] - new_checkpoint["final_layer_norm.bias"] = bert_state_dict["transformer.norm.bias"] - ldmbert = LDMBertModel(config) - ldmbert.eval() - ldmbert.load_dict(new_checkpoint) - return ldmbert - - -def convert_ldm_clip_checkpoint(checkpoint): - text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") - text_model.eval() - - keys = list(checkpoint.keys()) - - text_model_dict = {} - - for key in keys: - if key.startswith("cond_stage_model.transformer"): - text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] - - if len(text_model_dict) > 0: - text_model.load_dict(CLIPTextModel.smart_convert(text_model_dict, text_model)) - - return text_model - - -textenc_conversion_lst = [ - ("cond_stage_model.model.positional_embedding", "text_model.embeddings.position_embedding.weight"), - ("cond_stage_model.model.token_embedding.weight", "text_model.embeddings.token_embedding.weight"), - ("cond_stage_model.model.ln_final.weight", "text_model.final_layer_norm.weight"), - ("cond_stage_model.model.ln_final.bias", "text_model.final_layer_norm.bias"), -] 
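-# The table above covers the fixed (non per-layer) OpenCLIP text-encoder keys; the regex table below
-# rewrites the per-layer transformer keys, e.g. "resblocks.0.ln_1.weight" becomes
-# "text_model.encoder.layers.0.layer_norm1.weight".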
-textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst} - -textenc_transformer_conversion_lst = [ - # (stable-diffusion, HF Diffusers) - ("resblocks.", "text_model.encoder.layers."), - ("ln_1", "layer_norm1"), - ("ln_2", "layer_norm2"), - (".c_fc.", ".fc1."), - (".c_proj.", ".fc2."), - (".attn", ".self_attn"), - ("ln_final.", "transformer.text_model.final_layer_norm."), - ("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"), - ("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"), -] -protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst} -textenc_pattern = re.compile("|".join(protected.keys())) - - -def convert_paint_by_example_checkpoint(checkpoint): - config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14") - model = PaintByExampleImageEncoder(config) - model.eval() - - keys = list(checkpoint.keys()) - - model_dict = {} - - for key in keys: - if key.startswith("cond_stage_model.transformer"): - model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] - - # load mapper - keys_mapper = { - k[len("cond_stage_model.mapper.res") :]: v - for k, v in checkpoint.items() - if k.startswith("cond_stage_model.mapper") - } - - MAPPING = { - "attn.c_qkv": ["attn1.to_q", "attn1.to_k", "attn1.to_v"], - "attn.c_proj": ["attn1.to_out.0"], - "ln_1": ["norm1"], - "ln_2": ["norm3"], - "mlp.c_fc": ["ff.net.0.proj"], - "mlp.c_proj": ["ff.net.2"], - } - - for key, value in keys_mapper.items(): - prefix = key[: len("blocks.i")] - suffix = key.split(prefix)[-1].split(".")[-1] - name = key.split(prefix)[-1].split(suffix)[0][1:-1] - mapped_names = MAPPING[name] - - num_splits = len(mapped_names) - for i, mapped_name in enumerate(mapped_names): - new_name = ".".join([prefix, mapped_name, suffix]) - shape = value.shape[0] // num_splits - model_dict[new_name] = value[i * shape : (i + 1) * shape] - - # load final layer norm - model_dict["final_layer_norm.bias"] = checkpoint["cond_stage_model.final_ln.bias"] - model_dict["final_layer_norm.weight"] = checkpoint["cond_stage_model.final_ln.bias"] - - # load proj_out - model_dict["proj_out.bias"] = checkpoint["proj_out.bias"] - model_dict["proj_out.weight"] = checkpoint["proj_out.weight"] - - # load uncond vector - model_dict["uncond_vector"] = checkpoint["learnable_vector"] - - if len(model_dict) > 0: - model.load_dict(PaintByExampleImageEncoder.smart_convert(model_dict, model)) - - return model - - -def convert_open_clip_checkpoint(checkpoint): - text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder") - text_model.eval() - keys = list(checkpoint.keys()) - - text_model_dict = {} - - if "cond_stage_model.model.text_projection" in checkpoint: - d_model = int(checkpoint["cond_stage_model.model.text_projection"].shape[0]) - else: - d_model = 1024 - - # text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids") - - for key in keys: - if "resblocks.23" in key: # Diffusers drops the final layer and only uses the penultimate layer - continue - if key in textenc_conversion_map: - text_model_dict[textenc_conversion_map[key]] = checkpoint[key] - if key.startswith("cond_stage_model.model.transformer."): - new_key = key[len("cond_stage_model.model.transformer.") :] - if new_key.endswith(".in_proj_weight"): - new_key = new_key[: -len(".in_proj_weight")] - new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) - 
text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :] - text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :] - text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :] - elif new_key.endswith(".in_proj_bias"): - new_key = new_key[: -len(".in_proj_bias")] - new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) - text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model] - text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2] - text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model * 2 :] - else: - new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) - - text_model_dict[new_key] = checkpoint[key] - if len(text_model_dict) > 0: - text_model.load_dict(CLIPTextModel.smart_convert(text_model_dict, text_model)) - return text_model - - -def stable_unclip_image_encoder(original_config): - """ - Returns the image processor and clip image encoder for the img2img unclip pipeline. - - We currently know of two types of stable unclip models which separately use the clip and the openclip image - encoders. - """ - - image_embedder_config = original_config.model.params.embedder_config - - sd_clip_image_embedder_class = image_embedder_config.target - sd_clip_image_embedder_class = sd_clip_image_embedder_class.split(".")[-1] - - if sd_clip_image_embedder_class == "ClipImageEmbedder": - clip_model_name = image_embedder_config.params.model - - if clip_model_name == "ViT-L/14": - feature_extractor = CLIPImageProcessor() - image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14") - else: - raise NotImplementedError(f"Unknown CLIP checkpoint name in stable diffusion checkpoint {clip_model_name}") - - elif sd_clip_image_embedder_class == "FrozenOpenCLIPImageEmbedder": - feature_extractor = CLIPImageProcessor() - image_encoder = CLIPVisionModelWithProjection.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K") - else: - raise NotImplementedError( - f"Unknown CLIP image embedder class in stable diffusion checkpoint {sd_clip_image_embedder_class}" - ) - image_encoder.eval() - return feature_extractor, image_encoder - - -def stable_unclip_image_noising_components( - original_config, - clip_stats_path: Optional[str] = None, -): - """ - Returns the noising components for the img2img and txt2img unclip pipelines. - - Converts the stability noise augmentor into - 1. a `StableUnCLIPImageNormalizer` for holding the CLIP stats - 2. a `DDPMScheduler` for holding the noise schedule - - If the noise augmentor config specifies a clip stats path, the `clip_stats_path` must be provided. 
- """ - noise_aug_config = original_config.model.params.noise_aug_config - noise_aug_class = noise_aug_config.target - noise_aug_class = noise_aug_class.split(".")[-1] - - if noise_aug_class == "CLIPEmbeddingNoiseAugmentation": - noise_aug_config = noise_aug_config.params - embedding_dim = noise_aug_config.timestep_dim - max_noise_level = noise_aug_config.noise_schedule_config.timesteps - beta_schedule = noise_aug_config.noise_schedule_config.beta_schedule - - image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedding_dim) - image_noising_scheduler = DDPMScheduler(num_train_timesteps=max_noise_level, beta_schedule=beta_schedule) - - if "clip_stats_path" in noise_aug_config: - if clip_stats_path is None: - raise ValueError("This stable unclip config requires a `clip_stats_path`") - - from ...utils import torch_load - - clip_mean, clip_std = torch_load(clip_stats_path) - if hasattr(clip_mean, "numpy"): - clip_mean = clip_mean.numpy() - if hasattr(clip_std, "numpy"): - clip_std = clip_std.numpy() - clip_mean = clip_mean[None, :] - clip_std = clip_std[None, :] - - clip_stats_state_dict = { - "mean": clip_mean, - "std": clip_std, - } - - image_normalizer.load_dict(clip_stats_state_dict) - else: - raise NotImplementedError(f"Unknown noise augmentor class: {noise_aug_class}") - image_normalizer.eval() - return image_normalizer, image_noising_scheduler - - -def convert_controlnet_checkpoint( - checkpoint, original_config, checkpoint_path, image_size, upcast_attention, extract_ema, no_unet_key=False -): - ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True) - ctrlnet_config["upcast_attention"] = upcast_attention - - ctrlnet_config.pop("sample_size") - - controlnet_model = ControlNetModel(**ctrlnet_config) - - converted_ctrl_checkpoint = convert_ldm_unet_checkpoint( - checkpoint, - ctrlnet_config, - path=checkpoint_path, - extract_ema=extract_ema, - controlnet=True, - no_unet_key=no_unet_key, - ) - - controlnet_model.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(controlnet_model, converted_ctrl_checkpoint)) - controlnet_model.eval() - return controlnet_model - - -def download_from_original_stable_diffusion_ckpt( - checkpoint_path: str, - original_config_file: str = None, - image_size: int = 512, - prediction_type: str = None, - model_type: str = None, - extract_ema: bool = False, - scheduler_type: str = "pndm", - num_in_channels: Optional[int] = None, - upcast_attention: Optional[bool] = None, - stable_unclip: Optional[str] = None, - stable_unclip_prior: Optional[str] = None, - clip_stats_path: Optional[str] = None, - controlnet: Optional[bool] = None, - load_safety_checker: bool = True, - pipeline_class: DiffusionPipeline = None, - paddle_dtype=None, - **kwargs, -) -> DiffusionPipeline: - """ - Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml` - config file. - - Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the - global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is - recommended that you override the default values and/or supply an `original_config_file` wherever possible. - - Args: - checkpoint_path (`str`): Path to `.ckpt` file. - original_config_file (`str`): - Path to `.yaml` config file corresponding to the original architecture. If `None`, will be automatically - inferred by looking for a key that only exists in SD2.0 models. 
- image_size (`int`, *optional*, defaults to 512):
- The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2
- Base. Use 768 for Stable Diffusion v2.
- prediction_type (`str`, *optional*):
- The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion v1.X and Stable
- Diffusion v2 Base. Use `'v_prediction'` for Stable Diffusion v2.
- num_in_channels (`int`, *optional*, defaults to `None`):
- The number of input channels. If `None`, it will be automatically inferred.
- scheduler_type (`str`, *optional*, defaults to 'pndm'):
- Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm",
- "ddim"]`.
- model_type (`str`, *optional*, defaults to `None`):
- The pipeline type. `None` to automatically infer, or one of `["FrozenOpenCLIPEmbedder",
- "FrozenCLIPEmbedder", "PaintByExample"]`.
- extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for
- checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to
- `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for
- inference. Non-EMA weights are usually better to continue fine-tuning.
- upcast_attention (`bool`, *optional*, defaults to `None`):
- Whether the attention computation should always be upcasted. This is necessary when running stable
- diffusion 2.1.
- load_safety_checker (`bool`, *optional*, defaults to `True`):
- Whether to load the safety checker or not. Defaults to `True`.
- pipeline_class (`DiffusionPipeline`, *optional*, defaults to `None`):
- The pipeline class to use. Pass `None` to determine automatically.
- return: A StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file.
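- Example (an illustrative sketch; the checkpoint, config and output paths below are placeholders):
- pipe = download_from_original_stable_diffusion_ckpt(
- checkpoint_path="v1-5-pruned-emaonly.ckpt",
- original_config_file="v1-inference.yaml",  # may be omitted; it is then inferred from the checkpoint
- scheduler_type="ddim",
- extract_ema=True,
- )
- pipe.save_pretrained("converted_stable_diffusion")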
- """ - - # import pipelines here to avoid circular import error when using from_ckpt method - from ppdiffusers import ( - LDMTextToImagePipeline, - PaintByExamplePipeline, - StableDiffusionControlNetPipeline, - StableDiffusionPipeline, - StableUnCLIPImg2ImgPipeline, - StableUnCLIPPipeline, - ) - - if pipeline_class is None or pipeline_class.__name__ == "DiffusionPipeline": - pipeline_class = StableDiffusionPipeline - - if prediction_type == "v-prediction": - prediction_type = "v_prediction" - - if not is_omegaconf_available(): - raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) - - from omegaconf import OmegaConf - - checkpoint = smart_load(checkpoint_path, return_numpy=True, return_global_step=True) - - # NOTE: this while loop isn't great but this controlnet checkpoint has one additional - # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21 - while "state_dict" in checkpoint: - checkpoint = checkpoint["state_dict"] - - global_step = int(checkpoint.pop("global_step", -1)) - - if global_step == -1: - print("global_step key not found in model") - - # must cast them to float32 - newcheckpoint = {} - for k, v in checkpoint.items(): - try: - if "int" in str(v.dtype): - continue - except Exception: - continue - newcheckpoint[k] = v.astype("float32") - checkpoint = newcheckpoint - - if original_config_file is None: - key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight" - - # model_type = "v1" - config_url = "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/v1-inference.yaml" - - if key_name in checkpoint and checkpoint[key_name].shape[-1] == 1024: - # model_type = "v2" - config_url = "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/v2-inference-v.yaml" - - if global_step == 110000: - # v2.1 needs to upcast attention - upcast_attention = True - - original_config_file = BytesIO(requests.get(config_url).content) - - original_config = OmegaConf.load(original_config_file) - - if num_in_channels is not None: - original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels - - if ( - "parameterization" in original_config["model"]["params"] - and original_config["model"]["params"]["parameterization"] == "v" - ): - if prediction_type is None: - # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"` - # as it relies on a brittle global step parameter here - prediction_type = "epsilon" if global_step == 875000 else "v_prediction" - if image_size is None: - # NOTE: For stable diffusion 2 base one has to pass `image_size==512` - # as it relies on a brittle global step parameter here - image_size = 512 if global_step == 875000 else 768 - else: - if prediction_type is None: - prediction_type = "epsilon" - if image_size is None: - image_size = 512 - - if controlnet is None: - controlnet = "control_stage_config" in original_config.model.params - - if controlnet: - controlnet_model = convert_controlnet_checkpoint( - checkpoint, original_config, checkpoint_path, image_size, upcast_attention, extract_ema - ) - num_train_timesteps = original_config.model.params.timesteps - beta_start = original_config.model.params.linear_start - beta_end = original_config.model.params.linear_end - - scheduler = DDIMScheduler( - beta_end=beta_end, - beta_schedule="scaled_linear", - beta_start=beta_start, - num_train_timesteps=num_train_timesteps, - steps_offset=1, - clip_sample=False, - set_alpha_to_one=False, - prediction_type=prediction_type, - ) - # make sure scheduler works 
correctly with DDIM - scheduler.register_to_config(clip_sample=False) - - if scheduler_type == "pndm": - config = dict(scheduler.config) - config["skip_prk_steps"] = True - scheduler = PNDMScheduler.from_config(config) - elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "dpm": - scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) - elif scheduler_type == "ddim": - scheduler = scheduler - else: - raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - - # Convert the UNet2DConditionModel model. - unet_config = create_unet_diffusers_config(original_config, image_size=image_size) - unet_config["upcast_attention"] = upcast_attention - unet = UNet2DConditionModel(**unet_config) - unet.eval() - - converted_unet_checkpoint = convert_ldm_unet_checkpoint( - checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema - ) - unet.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(unet, converted_unet_checkpoint)) - - # Convert the VAE model. - vae_config = create_vae_diffusers_config(original_config, image_size=image_size) - converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) - - vae = AutoencoderKL(**vae_config) - vae.eval() - vae.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(vae, converted_vae_checkpoint)) - - # Convert the text model. - if model_type is None: - model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] - logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}") - if model_type == "FrozenOpenCLIPEmbedder": - text_model = convert_open_clip_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2/tokenizer") - - if stable_unclip is None: - if controlnet: - pipe = StableDiffusionControlNetPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - controlnet=controlnet_model, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - else: - pipe = pipeline_class( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - else: - image_normalizer, image_noising_scheduler = stable_unclip_image_noising_components( - original_config, - clip_stats_path=clip_stats_path, - ) - - if stable_unclip == "img2img": - feature_extractor, image_encoder = stable_unclip_image_encoder(original_config) - - pipe = StableUnCLIPImg2ImgPipeline( - # image encoding components - feature_extractor=feature_extractor, - image_encoder=image_encoder, - # image noising components - image_normalizer=image_normalizer, - image_noising_scheduler=image_noising_scheduler, - # regular denoising components - tokenizer=tokenizer, - text_encoder=text_model, - unet=unet, - scheduler=scheduler, - # vae - vae=vae, - ) - elif stable_unclip == "txt2img": - if stable_unclip_prior is None or stable_unclip_prior == "karlo": - karlo_model = "kakaobrain/karlo-v1-alpha" - prior = PriorTransformer.from_pretrained(karlo_model, subfolder="prior") - - prior_tokenizer = 
CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - prior_text_model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-large-patch14") - - prior_scheduler = UnCLIPScheduler.from_pretrained(karlo_model, subfolder="prior_scheduler") - prior_scheduler = DDPMScheduler.from_config(prior_scheduler.config) - else: - raise NotImplementedError(f"unknown prior for stable unclip model: {stable_unclip_prior}") - - pipe = StableUnCLIPPipeline( - # prior components - prior_tokenizer=prior_tokenizer, - prior_text_encoder=prior_text_model, - prior=prior, - prior_scheduler=prior_scheduler, - # image noising components - image_normalizer=image_normalizer, - image_noising_scheduler=image_noising_scheduler, - # regular denoising components - tokenizer=tokenizer, - text_encoder=text_model, - unet=unet, - scheduler=scheduler, - # vae - vae=vae, - ) - else: - raise NotImplementedError(f"unknown `stable_unclip` type: {stable_unclip}") - elif model_type == "PaintByExample": - vision_model = convert_paint_by_example_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker") - pipe = PaintByExamplePipeline( - vae=vae, - image_encoder=vision_model, - unet=unet, - scheduler=scheduler, - safety_checker=None, - feature_extractor=feature_extractor, - ) - elif model_type == "FrozenCLIPEmbedder": - text_model = convert_ldm_clip_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - - if load_safety_checker: - safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") - feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker") - else: - safety_checker = None - feature_extractor = None - - if controlnet: - pipe = StableDiffusionControlNetPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - controlnet=controlnet_model, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - requires_safety_checker=load_safety_checker, - ) - else: - pipe = pipeline_class( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - requires_safety_checker=load_safety_checker, - ) - else: - text_config = create_ldm_bert_config(original_config) - text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", model_max_length=77) - - pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler) - if paddle_dtype is not None: - pipe.to(paddle_dtype=paddle_dtype) - - return pipe - - -def download_controlnet_from_original_ckpt( - checkpoint_path: str, - original_config_file: str, - image_size: int = 512, - extract_ema: bool = False, - num_in_channels: Optional[int] = None, - upcast_attention: Optional[bool] = None, - no_unet_key: Optional[bool] = False, -) -> DiffusionPipeline: - if not is_omegaconf_available(): - raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) - - from omegaconf import OmegaConf - - checkpoint = smart_load(checkpoint_path, return_numpy=True) - - # NOTE: this while loop isn't great but this controlnet checkpoint has one additional - # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21 - while "state_dict" in 
checkpoint: - checkpoint = checkpoint["state_dict"] - - # must cast them to float32 - newcheckpoint = {} - for k, v in checkpoint.items(): - try: - if "int" in str(v.dtype): - continue - except Exception: - continue - newcheckpoint[k] = v.astype("float32") - checkpoint = newcheckpoint - - original_config = OmegaConf.load(original_config_file) - - if num_in_channels is not None: - original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels - - if "control_stage_config" not in original_config.model.params: - raise ValueError("`control_stage_config` not present in original config") - - controlnet_model = convert_controlnet_checkpoint( - checkpoint, original_config, checkpoint_path, image_size, upcast_attention, extract_ema, no_unet_key - ) - - return controlnet_model diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt_deprecated.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt_deprecated.py deleted file mode 100644 index 4ddf69fa090c..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt_deprecated.py +++ /dev/null @@ -1,1151 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Conversion script for the Stable Diffusion checkpoints.""" -import os -import re -import tempfile -from typing import Optional - -import numpy as np -import requests - -from paddlenlp.transformers import ( - BertTokenizer, - CLIPFeatureExtractor, - CLIPTextModel, - CLIPTokenizer, -) -from ppdiffusers import ( - AutoencoderKL, - ControlNetModel, - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - HeunDiscreteScheduler, - LDMTextToImagePipeline, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionControlNetPipeline, - StableDiffusionPipeline, - UNet2DConditionModel, -) -from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import ( - LDMBertConfig, - LDMBertModel, -) -from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker - -from ...utils import is_omegaconf_available, logging -from ...utils.import_utils import BACKENDS_MAPPING -from ...utils.load_utils import smart_load - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -def shave_segments(path, n_shave_prefix_segments=1): - """ - Removes segments. Positive values shave the first segments, negative shave the last segments. 
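- For example, shave_segments("a.b.c.d", 1) returns "b.c.d", while shave_segments("a.b.c.d", -1) returns "a.b.c".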
- """ - if n_shave_prefix_segments >= 0: - return ".".join(path.split(".")[n_shave_prefix_segments:]) - else: - return ".".join(path.split(".")[:n_shave_prefix_segments]) - - -def renew_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item.replace("in_layers.0", "norm1") - new_item = new_item.replace("in_layers.2", "conv1") - - new_item = new_item.replace("out_layers.0", "norm2") - new_item = new_item.replace("out_layers.3", "conv2") - - new_item = new_item.replace("emb_layers.1", "time_emb_proj") - new_item = new_item.replace("skip_connection", "conv_shortcut") - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace("norm.weight", "group_norm.weight") - new_item = new_item.replace("norm.bias", "group_norm.bias") - - new_item = new_item.replace("q.weight", "query.weight") - new_item = new_item.replace("q.bias", "query.bias") - - new_item = new_item.replace("k.weight", "key.weight") - new_item = new_item.replace("k.bias", "key.bias") - - new_item = new_item.replace("v.weight", "value.weight") - new_item = new_item.replace("v.bias", "value.bias") - - new_item = new_item.replace("proj_out.weight", "proj_attn.weight") - new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def assign_to_checkpoint( - paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None -): - """ - This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits - attention layers, and takes into account additional replacements that may arise. - - Assigns the weights to the new checkpoint. - """ - assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." - - # Splits the attention layers into three variables. 
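- # (a fused qkv tensor from the original checkpoint is reshaped and split into separate
- # query/key/value entries of the new checkpoint before the per-path renaming below)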
- if attention_paths_to_split is not None: - for path, path_map in attention_paths_to_split.items(): - old_tensor = old_checkpoint[path] - channels = old_tensor.shape[0] // 3 - - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) - - num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - - old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) - query, key, value = np.split(old_tensor, 3, axis=1) - - checkpoint[path_map["query"]] = query.reshape(target_shape) - checkpoint[path_map["key"]] = key.reshape(target_shape) - checkpoint[path_map["value"]] = value.reshape(target_shape) - - for path in paths: - new_path = path["new"] - - # These have already been assigned - if attention_paths_to_split is not None and new_path in attention_paths_to_split: - continue - - # Global renaming happens here - new_path = new_path.replace("middle_block.0", "mid_block.resnets.0") - new_path = new_path.replace("middle_block.1", "mid_block.attentions.0") - new_path = new_path.replace("middle_block.2", "mid_block.resnets.1") - - if additional_replacements is not None: - for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], replacement["new"]) - - # proj_attn.weight has to be converted from conv 1D to linear - if "proj_attn.weight" in new_path: - checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0] - else: - checkpoint[new_path] = old_checkpoint[path["old"]] - - -def conv_attn_to_linear(checkpoint): - keys = list(checkpoint.keys()) - attn_keys = ["query.weight", "key.weight", "value.weight"] - for key in keys: - if ".".join(key.split(".")[-2:]) in attn_keys: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0, 0] - elif "proj_attn.weight" in key: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0] - - -def create_unet_diffusers_config(original_config, image_size: int, controlnet=False): - """ - Creates a config for the diffusers based on the config of the LDM model. 
- """ - if controlnet: - unet_params = original_config.model.params.control_stage_config.params - else: - unet_params = original_config.model.params.unet_config.params - - vae_params = original_config.model.params.first_stage_config.params.ddconfig - - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] - - down_block_types = [] - resolution = 1 - for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" - down_block_types.append(block_type) - if i != len(block_out_channels) - 1: - resolution *= 2 - - up_block_types = [] - for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" - up_block_types.append(block_type) - resolution //= 2 - - vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) - - head_dim = unet_params.num_heads if "num_heads" in unet_params else None - use_linear_projection = ( - unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False - ) - if use_linear_projection: - # stable diffusion 2-base-512 and 2-768 - if head_dim is None: - head_dim = [5, 10, 20, 20] - - class_embed_type = None - projection_class_embeddings_input_dim = None - - if "num_classes" in unet_params: - if unet_params.num_classes == "sequential": - class_embed_type = "projection" - assert "adm_in_channels" in unet_params - projection_class_embeddings_input_dim = unet_params.adm_in_channels - else: - raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params.num_classes}") - - config = dict( - sample_size=image_size // vae_scale_factor, - in_channels=unet_params.in_channels, - down_block_types=tuple(down_block_types), - block_out_channels=tuple(block_out_channels), - layers_per_block=unet_params.num_res_blocks, - cross_attention_dim=unet_params.context_dim, - attention_head_dim=head_dim, - use_linear_projection=use_linear_projection, - class_embed_type=class_embed_type, - projection_class_embeddings_input_dim=projection_class_embeddings_input_dim, - ) - if not controlnet: - config["out_channels"] = unet_params.out_channels - config["up_block_types"] = tuple(up_block_types) - - return config - - -def create_vae_diffusers_config(original_config, image_size: int): - """ - Creates a config for the diffusers based on the config of the LDM model. 
- """ - vae_params = original_config.model.params.first_stage_config.params.ddconfig - _ = original_config.model.params.first_stage_config.params.embed_dim - - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] - down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) - up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) - - config = dict( - sample_size=image_size, - in_channels=vae_params.in_channels, - out_channels=vae_params.out_ch, - down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), - block_out_channels=tuple(block_out_channels), - latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, - ) - return config - - -def get_default(params, key, default): - if key in params: - return params[key] - else: - return default - - -def create_ldm_bert_config(original_config): - bert_params = dict(original_config.model.params.cond_stage_config.params) - config = dict( - vocab_size=get_default(bert_params, "vocab_size", 30522), - max_position_embeddings=get_default(bert_params, "max_seq_len", 77), - encoder_layers=get_default(bert_params, "n_layer", 32), - encoder_ffn_dim=get_default(bert_params, "n_embed", 1280) * 4, - encoder_attention_heads=8, - head_dim=64, - activation_function="gelu", - d_model=get_default(bert_params, "n_embed", 1280), - dropout=0.0, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - pad_token_id=0, - ) - return LDMBertConfig(**config) - - -def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False, controlnet=False): - """ - Takes a state dict and a config, and returns a converted checkpoint. - """ - - # extract state_dict for UNet - unet_state_dict = {} - keys = list(checkpoint.keys()) - - if controlnet: - unet_key = "control_model." - else: - unet_key = "model.diffusion_model." - - # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA - if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema: - print(f"Checkpoint {path} has both EMA and non-EMA weights.") - print( - "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA" - " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag." - ) - for key in keys: - if key.startswith(unet_key[:-1]): - flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) - else: - if sum(k.startswith("model_ema") for k in keys) > 100: - print( - "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" - " weights (usually better for inference), please make sure to add the `--extract_ema` flag." - ) - - for key in keys: - if key.startswith(unet_key): - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) - - new_checkpoint = {} - - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] - - if config["class_embed_type"] is None: - # No parameters to port - ... 
- elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection": - new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"] - new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"] - new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"] - new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"] - else: - raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}") - - new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] - new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] - - if not controlnet: - new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] - new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] - new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] - new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] - - # Retrieves the keys for the input blocks only - num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) - input_blocks = { - layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] - for layer_id in range(num_input_blocks) - } - - # Retrieves the keys for the middle blocks only - num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) - middle_blocks = { - layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] - for layer_id in range(num_middle_blocks) - } - - # Retrieves the keys for the output blocks only - num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) - output_blocks = { - layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] - for layer_id in range(num_output_blocks) - } - - for i in range(1, num_input_blocks): - block_id = (i - 1) // (config["layers_per_block"] + 1) - layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) - - resnets = [ - key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key - ] - attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] - - if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.weight" - ) - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.bias" - ) - - paths = renew_resnet_paths(resnets) - meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - resnet_0 = middle_blocks[0] - attentions = middle_blocks[1] - resnet_1 = middle_blocks[2] - - resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) - - resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint(resnet_1_paths, 
new_checkpoint, unet_state_dict, config=config) - - attentions_paths = renew_attention_paths(attentions) - meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} - assign_to_checkpoint( - attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - for i in range(num_output_blocks): - block_id = i // (config["layers_per_block"] + 1) - layer_in_block_id = i % (config["layers_per_block"] + 1) - output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] - output_block_list = {} - - for layer in output_block_layers: - layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) - if layer_id in output_block_list: - output_block_list[layer_id].append(layer_name) - else: - output_block_list[layer_id] = [layer_name] - - if len(output_block_list) > 1: - resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] - attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] - - resnet_0_paths = renew_resnet_paths(resnets) - paths = renew_resnet_paths(resnets) - - meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - output_block_list = {k: sorted(v) for k, v in output_block_list.items()} - - if ["conv.bias", "conv.weight"] in output_block_list.values(): - index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) - new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight" - ] - new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias" - ] - - # Clear attentions as they have been attributed above. 
- if len(attentions) == 2: - attentions = [] - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = { - "old": f"output_blocks.{i}.1", - "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", - } - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - else: - resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) - for path in resnet_0_paths: - old_path = ".".join(["output_blocks", str(i), path["old"]]) - new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) - - new_checkpoint[new_path] = unet_state_dict[old_path] - - if controlnet: - # conditioning embedding - - orig_index = 0 - - new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight" - ) - new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias" - ) - - orig_index += 2 - - diffusers_index = 0 - - while diffusers_index < 6: - new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight" - ) - new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias" - ) - diffusers_index += 1 - orig_index += 2 - - new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight" - ) - new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias" - ) - - # down blocks - for i in range(num_input_blocks): - new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight") - new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias") - - # mid block - new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight") - new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias") - - return new_checkpoint - - -def convert_ldm_vae_checkpoint(checkpoint, config): - # extract state dict for VAE - vae_state_dict = {} - vae_key = "first_stage_model." 
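- # The autoencoder weights live under the "first_stage_model." prefix in the original checkpoint;
- # that prefix is stripped first, then the remaining keys are renamed to the AutoencoderKL layout.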
- keys = list(checkpoint.keys()) - for key in keys: - if key.startswith(vae_key): - vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) - - new_checkpoint = {} - - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] - - new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] - new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] - - # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) - down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) - } - - # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) - up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) - } - - for i in range(num_down_blocks): - resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] - - if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight" - ) - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias" - ) - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} - assign_to_checkpoint(paths, 
new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - - for i in range(num_up_blocks): - block_id = num_up_blocks - 1 - i - resnets = [ - key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key - ] - - if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight" - ] - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias" - ] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - return new_checkpoint - - -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint): - import paddle.nn as nn - - need_transpose = [] - for k, v in vae_or_unet.named_sublayers(include_self=True): - if isinstance(v, nn.Linear): - need_transpose.append(k + ".weight") - new_vae_or_unet = {} - for k, v in diffusers_vae_unet_checkpoint.items(): - if k not in need_transpose: - new_vae_or_unet[k] = v - else: - new_vae_or_unet[k] = v.T - return new_vae_or_unet - - -def convert_ldm_bert_checkpoint(checkpoint, config): - # extract state dict for bert - bert_state_dict = {} - bert_key = "cond_stage_model." 
- keys = list(checkpoint.keys()) - for key in keys: - if key.startswith(bert_key): - bert_state_dict[key.replace(bert_key, "")] = checkpoint.get(key) - - new_checkpoint = {} - new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict["transformer.token_emb.weight"] - new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict["transformer.pos_emb.emb.weight"] - for i in range(config.encoder_layers): - double_i = 2 * i - double_i_plus1 = 2 * i + 1 - # convert norm - new_checkpoint[f"encoder.layers.{i}.norm1.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.0.weight" - ] - new_checkpoint[f"encoder.layers.{i}.norm1.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.0.bias" - ] - - new_checkpoint[f"encoder.layers.{i}.self_attn.q_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_q.weight" - ].T - new_checkpoint[f"encoder.layers.{i}.self_attn.k_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_k.weight" - ].T - new_checkpoint[f"encoder.layers.{i}.self_attn.v_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_v.weight" - ].T - new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_out.weight" - ].T - new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_out.bias" - ] - - new_checkpoint[f"encoder.layers.{i}.norm2.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.0.weight" - ] - new_checkpoint[f"encoder.layers.{i}.norm2.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.0.bias" - ] - new_checkpoint[f"encoder.layers.{i}.linear1.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight" - ].T - new_checkpoint[f"encoder.layers.{i}.linear1.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias" - ] - new_checkpoint[f"encoder.layers.{i}.linear2.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight" - ].T - new_checkpoint[f"encoder.layers.{i}.linear2.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias" - ].T - - new_checkpoint["final_layer_norm.weight"] = bert_state_dict["transformer.norm.weight"] - new_checkpoint["final_layer_norm.bias"] = bert_state_dict["transformer.norm.bias"] - ldmbert = LDMBertModel(config) - ldmbert.eval() - ldmbert.load_dict(new_checkpoint) - return ldmbert - - -def convert_ldm_clip_checkpoint(checkpoint): - text_model = CLIPTextModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="text_encoder") - text_model.eval() - - keys = list(checkpoint.keys()) - - text_model_dict = {} - - for key in keys: - if key.startswith("cond_stage_model.transformer"): - text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] - - if len(text_model_dict) > 0: - text_model.load_dict(CLIPTextModel.smart_convert(text_model_dict, text_model)) - - return text_model - - -textenc_conversion_lst = [ - ("cond_stage_model.model.positional_embedding", "text_model.embeddings.position_embedding.weight"), - ("cond_stage_model.model.token_embedding.weight", "text_model.embeddings.token_embedding.weight"), - ("cond_stage_model.model.ln_final.weight", "text_model.final_layer_norm.weight"), - ("cond_stage_model.model.ln_final.bias", 
"text_model.final_layer_norm.bias"), -] -textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst} - -textenc_transformer_conversion_lst = [ - # (stable-diffusion, HF Diffusers) - ("resblocks.", "text_model.encoder.layers."), - ("ln_1", "layer_norm1"), - ("ln_2", "layer_norm2"), - (".c_fc.", ".fc1."), - (".c_proj.", ".fc2."), - (".attn", ".self_attn"), - ("ln_final.", "transformer.text_model.final_layer_norm."), - ("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"), - ("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"), -] -protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst} -textenc_pattern = re.compile("|".join(protected.keys())) - - -def convert_open_clip_checkpoint(checkpoint): - text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder") - text_model.eval() - keys = list(checkpoint.keys()) - - text_model_dict = {} - - if "cond_stage_model.model.text_projection" in checkpoint: - d_model = int(checkpoint["cond_stage_model.model.text_projection"].shape[0]) - else: - d_model = 1024 - - for key in keys: - if "resblocks.23" in key: # Diffusers drops the final layer and only uses the penultimate layer - continue - if key in textenc_conversion_map: - text_model_dict[textenc_conversion_map[key]] = checkpoint[key] - if key.startswith("cond_stage_model.model.transformer."): - new_key = key[len("cond_stage_model.model.transformer.") :] - if new_key.endswith(".in_proj_weight"): - new_key = new_key[: -len(".in_proj_weight")] - new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) - text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :] - text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :] - text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :] - elif new_key.endswith(".in_proj_bias"): - new_key = new_key[: -len(".in_proj_bias")] - new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) - text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model] - text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2] - text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model * 2 :] - else: - new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) - - text_model_dict[new_key] = checkpoint[key] - if len(text_model_dict) > 0: - text_model.load_dict(CLIPTextModel.smart_convert(text_model_dict, text_model)) - return text_model - - -def load_pipeline_from_original_stable_diffusion_ckpt( - checkpoint_path: str, - original_config_file: str = None, - image_size: int = 512, - prediction_type: str = None, - model_type: str = None, - extract_ema: bool = False, - scheduler_type: str = "pndm", - num_in_channels: Optional[int] = None, - upcast_attention: Optional[bool] = None, - paddle_dtype: Optional[bool] = None, - requires_safety_checker: bool = False, - controlnet: Optional[bool] = None, - cls=None, - **kwargs, -) -> StableDiffusionPipeline: - """ - Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml` - config file. - - Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the - global step count, which will likely fail for models that have undergone further fine-tuning. 
Therefore, it is - recommended that you override the default values and/or supply an `original_config_file` wherever possible. - - Args: - checkpoint_path (`str`): Path to `.ckpt` file. - original_config_file (`str`): - Path to `.yaml` config file corresponding to the original architecture. If `None`, will be automatically - inferred by looking for a key that only exists in SD2.0 models. - image_size (`int`, *optional*, defaults to 512): - The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2 - Base. Use 768 for Stable Diffusion v2. - prediction_type (`str`, *optional*): - The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion v1.X and Stable - Diffusion v2 Base. Use `'v_prediction'` for Stable Diffusion v2. - num_in_channels (`int`, *optional*, defaults to None): - The number of input channels. If `None`, it will be automatically inferred. - scheduler_type (`str`, *optional*, defaults to 'pndm'): - Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm", - "ddim"]`. - model_type (`str`, *optional*, defaults to `None`): - The pipeline type. `None` to automatically infer, or one of `["FrozenOpenCLIPEmbedder", - "FrozenCLIPEmbedder",]`. - extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for - checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to - `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for - inference. Non-EMA weights are usually better to continue fine-tuning. - upcast_attention (`bool`, *optional*, defaults to `None`): - Whether the attention computation should always be upcasted. This is necessary when running stable - diffusion 2.1. 
- """ - if cls is None or cls.__name__ == "DiffusionPipeline": - cls = StableDiffusionPipeline - - if prediction_type == "v-prediction": - prediction_type = "v_prediction" - - if not is_omegaconf_available(): - raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) - - from omegaconf import OmegaConf - - checkpoint = smart_load(checkpoint_path, return_numpy=True, return_global_step=True) - - global_step = int(checkpoint.pop("global_step", -1)) - - if global_step == -1: - print("global_step key not found in model") - - # must cast them to float32 - newcheckpoint = {} - for k, v in checkpoint.items(): - try: - if "int" in str(v.dtype): - continue - except Exception: - continue - newcheckpoint[k] = v.astype("float32") - checkpoint = newcheckpoint - - if "state_dict" in checkpoint: - checkpoint = checkpoint["state_dict"] - - with tempfile.TemporaryDirectory() as tmpdir: - if original_config_file is None: - key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight" - - original_config_file = os.path.join(tmpdir, "inference.yaml") - if key_name in checkpoint and checkpoint[key_name].shape[-1] == 1024: - if not os.path.isfile("v2-inference-v.yaml"): - # model_type = "v2" - r = requests.get( - "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/v2-inference-v.yaml" - ) - open(original_config_file, "wb").write(r.content) - - if global_step == 110000: - # v2.1 needs to upcast attention - upcast_attention = True - else: - if not os.path.isfile("v1-inference.yaml"): - # model_type = "v1" - r = requests.get( - "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/v1-inference.yaml" - ) - open(original_config_file, "wb").write(r.content) - - original_config = OmegaConf.load(original_config_file) - - if num_in_channels is not None: - original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels - - if ( - "parameterization" in original_config["model"]["params"] - and original_config["model"]["params"]["parameterization"] == "v" - ): - if prediction_type is None: - # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"` - # as it relies on a brittle global step parameter here - prediction_type = "epsilon" if global_step == 875000 else "v_prediction" - if image_size is None: - # NOTE: For stable diffusion 2 base one has to pass `image_size==512` - # as it relies on a brittle global step parameter here - image_size = 512 if global_step == 875000 else 768 - else: - if prediction_type is None: - prediction_type = "epsilon" - if image_size is None: - image_size = 512 - - num_train_timesteps = original_config.model.params.timesteps - beta_start = original_config.model.params.linear_start - beta_end = original_config.model.params.linear_end - - scheduler = DDIMScheduler( - beta_end=beta_end, - beta_schedule="scaled_linear", - beta_start=beta_start, - num_train_timesteps=num_train_timesteps, - steps_offset=1, - clip_sample=False, - set_alpha_to_one=False, - prediction_type=prediction_type, - ) - # make sure scheduler works correctly with DDIM - scheduler.register_to_config(clip_sample=False) - - if scheduler_type == "pndm": - config = dict(scheduler.config) - config["skip_prk_steps"] = True - scheduler = PNDMScheduler.from_config(config) - elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "euler": - scheduler = 
EulerDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "dpm": - scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) - elif scheduler_type == "ddim": - scheduler = scheduler - else: - raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - - # Convert the UNet2DConditionModel model. - unet_config = create_unet_diffusers_config(original_config, image_size=image_size) - unet_config["upcast_attention"] = upcast_attention - unet = UNet2DConditionModel(**unet_config) - unet.eval() - - converted_unet_checkpoint = convert_ldm_unet_checkpoint( - checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema - ) - unet.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(unet, converted_unet_checkpoint)) - - # Convert the VAE model. - vae_config = create_vae_diffusers_config(original_config, image_size=image_size) - converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) - - vae = AutoencoderKL(**vae_config) - vae.eval() - vae.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(vae, converted_vae_checkpoint)) - - # Convert the text model. - if model_type is None: - model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] - logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}") - - if controlnet is None: - controlnet = "control_stage_config" in original_config.model.params - - if model_type == "FrozenOpenCLIPEmbedder": - text_model = convert_open_clip_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2/tokenizer") - - if paddle_dtype is not None: - vae.to(dtype=paddle_dtype) - text_model.to(dtype=paddle_dtype) - unet.to(dtype=paddle_dtype) - - if controlnet: - # Convert the ControlNetModel model. 
- ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True) - ctrlnet_config["upcast_attention"] = upcast_attention - - ctrlnet_config.pop("sample_size") - - controlnet_model = ControlNetModel(**ctrlnet_config) - controlnet_model.eval() - - converted_ctrl_checkpoint = convert_ldm_unet_checkpoint( - checkpoint, ctrlnet_config, path=checkpoint_path, extract_ema=extract_ema, controlnet=True - ) - controlnet_model.load_dict( - convert_diffusers_vae_unet_to_ppdiffusers(controlnet_model, converted_ctrl_checkpoint) - ) - - if paddle_dtype is not None: - controlnet_model.to(dtype=paddle_dtype) - - pipe = StableDiffusionControlNetPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - controlnet=controlnet_model, - scheduler=scheduler, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - else: - pipe = cls( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - - elif model_type == "FrozenCLIPEmbedder": - text_model = convert_ldm_clip_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained("CompVis/stable-diffusion-v1-4/tokenizer") - if requires_safety_checker: - safety_checker = StableDiffusionSafetyChecker.from_pretrained( - "CompVis/stable-diffusion-v1-4", subfolder="safety_checker" - ) - feature_extractor = CLIPFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-v1-4", subfolder="feature_extractor" - ) - else: - safety_checker = feature_extractor = None - - if paddle_dtype is not None: - vae.to(dtype=paddle_dtype) - text_model.to(dtype=paddle_dtype) - unet.to(dtype=paddle_dtype) - if requires_safety_checker: - safety_checker.to(dtype=paddle_dtype) - - if controlnet: - # Convert the ControlNetModel model. 
- ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True) - ctrlnet_config["upcast_attention"] = upcast_attention - - ctrlnet_config.pop("sample_size") - - controlnet_model = ControlNetModel(**ctrlnet_config) - controlnet_model.eval() - - converted_ctrl_checkpoint = convert_ldm_unet_checkpoint( - checkpoint, ctrlnet_config, path=checkpoint_path, extract_ema=extract_ema, controlnet=True - ) - controlnet_model.load_dict( - convert_diffusers_vae_unet_to_ppdiffusers(controlnet_model, converted_ctrl_checkpoint) - ) - - if paddle_dtype is not None: - controlnet_model.to(dtype=paddle_dtype) - - pipe = StableDiffusionControlNetPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - controlnet=controlnet_model, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - requires_safety_checker=requires_safety_checker, - ) - else: - pipe = cls( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - requires_safety_checker=requires_safety_checker, - ) - else: - text_config = create_ldm_bert_config(original_config) - text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", model_max_length=77) - if paddle_dtype is not None: - vae.to(dtype=paddle_dtype) - text_model.to(dtype=paddle_dtype) - unet.to(dtype=paddle_dtype) - pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler) - - return pipe diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/hf_clip_model.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/hf_clip_model.py deleted file mode 100644 index b57b2c4a96a3..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/hf_clip_model.py +++ /dev/null @@ -1,1314 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The OpenAI Team Authors and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
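For context on what this removal takes away, here is a minimal, hypothetical usage sketch of `load_pipeline_from_original_stable_diffusion_ckpt` as defined in the hunk above; the import path and the local file names are assumptions for illustration, not part of this patch:

```python
# Sketch only: exercises the converter deleted above. The module path below is an
# assumption (the file is removed by this patch); checkpoint/config paths are placeholders.
from ppdiffusers.pipelines.stable_diffusion.convert_from_ckpt import (  # assumed path
    load_pipeline_from_original_stable_diffusion_ckpt,
)

pipe = load_pipeline_from_original_stable_diffusion_ckpt(
    checkpoint_path="./v1-5-pruned.ckpt",        # placeholder CompVis-style .ckpt
    original_config_file="./v1-inference.yaml",  # optional; downloaded/inferred when None
    scheduler_type="ddim",
    extract_ema=True,                            # EMA weights usually give better samples
    requires_safety_checker=False,
)
pipe.save_pretrained("./stable-diffusion-v1-5-ppdiffusers")  # assumes the usual pipeline API
```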
-""" HF & Paddle CLIP model.""" - -import os -from dataclasses import dataclass -from typing import Any, Optional, Tuple, Union - -import numpy as np -import paddle -import paddle.nn.functional as F -from paddle import nn -from paddle.distributed.fleet.utils import recompute - -from paddlenlp.transformers.activations import ACT2FN -from paddlenlp.transformers.clip.configuration import ( - CLIPConfig, - CLIPTextConfig, - CLIPVisionConfig, -) -from paddlenlp.transformers.model_outputs import ( - BaseModelOutput, - BaseModelOutputWithPooling, - ModelOutput, -) -from paddlenlp.transformers.model_utils import PretrainedModel -from ppdiffusers.initializer import normal_, ones_ - -CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "openai/clip-vit-base-patch32", - # See all CLIP models at https://huggingface.co/models?filter=clip -] - - -def finfo(dtype: paddle.dtype = None): - if dtype is None: - dtype = paddle.get_default_dtype() - - if dtype == paddle.bfloat16: - # Numpy do not support `np.finfo(np.uint16)`, so try to construct a finfo object to fetch min value - class BFloatFInfo: - min = -3.3895313892515355e38 - - return BFloatFInfo - if dtype == paddle.float32: - return np.finfo(np.float32) - if dtype == paddle.float16: - return np.finfo(np.float16) - if dtype == paddle.float64: - return np.finfo(np.float64) - - -def Parameter(data: paddle.Tensor, requires_grad=True): - tensor = paddle.create_parameter(data.shape, dtype=data.dtype, default_initializer=nn.initializer.Assign(data)) - if not requires_grad: - tensor.stop_gradient = True - return tensor - - -class TorchLinear(nn.Layer): - """ - Same as paddle.layer.Linear, except weight matrix is stored as [out_features, in_features] (same as torch), - instead of [in_features, out_features] - """ - - def __init__( - self, - in_features, - out_features, - weight_attr=None, - bias_attr=None, - name=None, - bias=None, - ): - super().__init__() - self._dtype = self._helper.get_default_dtype() - self._weight_attr = weight_attr - if bias is not None: - bias_attr = bias - self._bias_attr = bias_attr - self.in_features = in_features - self.out_features = out_features - self.weight = self.create_parameter( - shape=[out_features, in_features], # regular linear has shape [in_features, out_features] - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False, - ) - self.bias = self.create_parameter( - shape=[out_features], - attr=self._bias_attr, - dtype=self._dtype, - is_bias=True, - ) - self.name = name - - def forward(self, input): - out = F.linear(x=input, weight=self.weight.T, bias=self.bias, name=self.name) - return out - - def extra_repr(self): - name_str = ", name={}".format(self.name) if self.name else "" - return "in_features={}, out_features={}, dtype={}{}".format( - self.weight.shape[1], self.weight.shape[0], self._dtype, name_str - ) - - -if bool(os.getenv("USE_TORCH_LINEAR", False)): - LinearClass = TorchLinear -else: - LinearClass = nn.Linear - - -def masked_fill(x, mask, value): - y = paddle.full(x.shape, value, x.dtype) - return paddle.where(mask, y, x) - - -def _expand_mask(mask: paddle.Tensor, dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
- """ - bsz, src_len = mask.shape - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand([bsz, 1, tgt_len, src_len]).cast(dtype) - - inverted_mask = 1.0 - expanded_mask - - return masked_fill(inverted_mask, inverted_mask.cast(paddle.bool), finfo(dtype).min) - - -# contrastive loss function, adapted from -# https://sachinruk.github.io/blog/pypaddle/pypaddle%20lightning/loss%20function/gpu/2021/03/07/CLIP.html -def contrastive_loss(logits: paddle.Tensor) -> paddle.Tensor: - return F.cross_entropy(logits, paddle.arange(len(logits))) - - -def clip_loss(similarity: paddle.Tensor) -> paddle.Tensor: - caption_loss = contrastive_loss(similarity) - image_loss = contrastive_loss(similarity.t()) - return (caption_loss + image_loss) / 2.0 - - -@dataclass -class HFCLIPVisionModelOutput(ModelOutput): - """ - Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. - - Args: - image_embeds (`paddle.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): - The image embeddings obtained by applying the projection layer to the pooler_output. - last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - image_embeds: Optional[paddle.Tensor] = None - last_hidden_state: paddle.Tensor = None - hidden_states: Optional[Tuple[paddle.Tensor]] = None - attentions: Optional[Tuple[paddle.Tensor]] = None - - -@dataclass -class HFCLIPTextModelOutput(ModelOutput): - """ - Base class for text model's outputs that also contains a pooling of the last hidden states. - - Args: - text_embeds (`paddle.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): - The text embeddings obtained by applying the projection layer to the pooler_output. - last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. 
- attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - text_embeds: Optional[paddle.Tensor] = None - last_hidden_state: paddle.Tensor = None - hidden_states: Optional[Tuple[paddle.Tensor]] = None - attentions: Optional[Tuple[paddle.Tensor]] = None - - -@dataclass -class HFCLIPOutput(ModelOutput): - """ - Args: - loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): - Contrastive loss for image-text similarity. - logits_per_image:(`paddle.Tensor` of shape `(image_batch_size, text_batch_size)`): - The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text - similarity scores. - logits_per_text:(`paddle.Tensor` of shape `(text_batch_size, image_batch_size)`): - The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image - similarity scores. - text_embeds(`paddle.Tensor` of shape `(batch_size, output_dim`): - The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`]. - image_embeds(`paddle.Tensor` of shape `(batch_size, output_dim`): - The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`]. - text_model_output(`BaseModelOutputWithPooling`): - The output of the [`CLIPTextModel`]. - vision_model_output(`BaseModelOutputWithPooling`): - The output of the [`CLIPVisionModel`]. - """ - - loss: Optional[paddle.Tensor] = None - logits_per_image: paddle.Tensor = None - logits_per_text: paddle.Tensor = None - text_embeds: paddle.Tensor = None - image_embeds: paddle.Tensor = None - text_model_output: BaseModelOutputWithPooling = None - vision_model_output: BaseModelOutputWithPooling = None - - def to_tuple(self) -> Tuple[Any]: - return tuple( - self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() - for k in self.keys() - ) - - -class HFCLIPVisionEmbeddings(nn.Layer): - def __init__(self, config: CLIPVisionConfig): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.image_size = config.image_size - self.patch_size = config.patch_size - - self.class_embedding = Parameter(paddle.randn((self.embed_dim,))) - - self.patch_embedding = nn.Conv2D( - in_channels=config.num_channels, - out_channels=self.embed_dim, - kernel_size=self.patch_size, - stride=self.patch_size, - bias_attr=False, - ) - - self.num_patches = (self.image_size // self.patch_size) ** 2 - self.num_positions = self.num_patches + 1 - self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer( - "position_ids", paddle.arange(self.num_positions).expand((1, -1), dtype="int64"), persistable=True - ) - - def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: - batch_size = pixel_values.shape[0] - patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] - patch_embeds = patch_embeds.flatten(2).transpose([0, 2, 1]) - - class_embeds = self.class_embedding.expand([batch_size, 1, -1]) - embeddings = paddle.concat([class_embeds, patch_embeds], axis=1) - embeddings = embeddings + self.position_embedding(self.position_ids) - return embeddings - - -class 
HFCLIPTextEmbeddings(nn.Layer): - def __init__(self, config: CLIPTextConfig): - super().__init__() - embed_dim = config.hidden_size - - self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) - self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) - - # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer( - "position_ids", - paddle.arange(config.max_position_embeddings, dtype="int64").expand((1, -1)), - persistable=True, - ) - - def forward( - self, - input_ids: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.Tensor] = None, - inputs_embeds: Optional[paddle.Tensor] = None, - ) -> paddle.Tensor: - seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] - - if position_ids is None: - position_ids = self.position_ids[:, :seq_length] - - if inputs_embeds is None: - inputs_embeds = self.token_embedding(input_ids) - - position_embeddings = self.position_embedding(position_ids) - embeddings = inputs_embeds + position_embeddings - - return embeddings - - -class HFCLIPAttention(nn.Layer): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." - ) - self.scale = self.head_dim**-0.5 - self.dropout = config.attention_dropout - - self.k_proj = LinearClass(self.embed_dim, self.embed_dim) - self.v_proj = LinearClass(self.embed_dim, self.embed_dim) - self.q_proj = LinearClass(self.embed_dim, self.embed_dim) - self.out_proj = LinearClass(self.embed_dim, self.embed_dim) - - def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): - return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) - - def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: Optional[paddle.Tensor] = None, - causal_attention_mask: Optional[paddle.Tensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - bsz, tgt_len, embed_dim = hidden_states.shape - - # get query proj - query_states = self.q_proj(hidden_states) * self.scale - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - - proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, bsz).reshape(proj_shape) - key_states = key_states.reshape(proj_shape) - value_states = value_states.reshape(proj_shape) - - src_len = key_states.shape[1] - attn_weights = paddle.matmul(query_states, key_states, transpose_y=True) - - if attn_weights.shape != [bsz * self.num_heads, tgt_len, src_len]: - raise ValueError( - f"Attention weights should be of size {[bsz * self.num_heads, tgt_len, src_len]}, but is" - f" {attn_weights.shape}" - ) - - # apply the causal_attention_mask first - if causal_attention_mask is not None: - if causal_attention_mask.shape != [bsz, 1, tgt_len, src_len]: - raise ValueError( - f"Attention mask should be of size {[bsz, 1, tgt_len, src_len]}, but is" - f" {causal_attention_mask.shape}" - ) - 
attn_weights = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) + causal_attention_mask - attn_weights = attn_weights.reshape([bsz * self.num_heads, tgt_len, src_len]) - - if attention_mask is not None: - if attention_mask.shape != [bsz, 1, tgt_len, src_len]: - raise ValueError( - f"Attention mask should be of size {[bsz, 1, tgt_len, src_len]}, but is {attention_mask.shape}" - ) - attn_weights = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) + attention_mask - attn_weights = attn_weights.reshape([bsz * self.num_heads, tgt_len, src_len]) - - attn_weights = F.softmax(attn_weights, axis=-1) - - if output_attentions: - # this operation is a bit akward, but it's required to - # make sure that attn_weights keeps its gradient. - # In order to do so, attn_weights have to reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) - attn_weights = attn_weights_reshaped.reshape([bsz * self.num_heads, tgt_len, src_len]) - else: - attn_weights_reshaped = None - - attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) - - attn_output = paddle.matmul(attn_probs, value_states) - - if attn_output.shape != [bsz * self.num_heads, tgt_len, self.head_dim]: - raise ValueError( - f"`attn_output` should be of size {[bsz, self.num_heads, tgt_len, self.head_dim]}, but is" - f" {attn_output.shape}" - ) - - attn_output = attn_output.reshape([bsz, self.num_heads, tgt_len, self.head_dim]) - attn_output = attn_output.transpose([0, 2, 1, 3]) - attn_output = attn_output.reshape([bsz, tgt_len, embed_dim]) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights_reshaped - - -class HFCLIPMLP(nn.Layer): - def __init__(self, config: CLIPTextConfig): - super().__init__() - self.config = config - self.activation_fn = ACT2FN[config.hidden_act] - self.fc1 = LinearClass(config.hidden_size, config.intermediate_size) - self.fc2 = LinearClass(config.intermediate_size, config.hidden_size) - - def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: - hidden_states = self.fc1(hidden_states) - hidden_states = self.activation_fn(hidden_states) - hidden_states = self.fc2(hidden_states) - return hidden_states - - -class HFCLIPEncoderLayer(nn.Layer): - def __init__(self, config: CLIPTextConfig): - super().__init__() - self.embed_dim = config.hidden_size - self.self_attn = HFCLIPAttention(config) - self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) - self.mlp = HFCLIPMLP(config) - self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) - - def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: paddle.Tensor, - causal_attention_mask: paddle.Tensor, - output_attentions: Optional[bool] = False, - ) -> Tuple[paddle.Tensor]: - """ - Args: - hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`paddle.Tensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - `(config.encoder_attention_heads,)`. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. 
- """ - residual = hidden_states - - hidden_states = self.layer_norm1(hidden_states) - hidden_states, attn_weights = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, - ) - hidden_states = residual + hidden_states - - residual = hidden_states - hidden_states = self.layer_norm2(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs - - -class HFCLIPPretrainedModel(PretrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = CLIPConfig - base_model_prefix = "clip" - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] - - @paddle.no_grad() - def _init_weights(self, module): - """Initialize the weights""" - factor = self.config.initializer_factor - if isinstance(module, HFCLIPTextEmbeddings): - normal_(module.token_embedding.weight, mean=0.0, std=factor * 0.02) - normal_(module.position_embedding.weight, mean=0.0, std=factor * 0.02) - elif isinstance(module, HFCLIPVisionEmbeddings): - factor = self.config.initializer_factor - normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) - normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) - normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) - elif isinstance(module, HFCLIPAttention): - factor = self.config.initializer_factor - in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor - out_proj_std = (module.embed_dim**-0.5) * factor - normal_(module.q_proj.weight, std=in_proj_std) - normal_(module.k_proj.weight, std=in_proj_std) - normal_(module.v_proj.weight, std=in_proj_std) - normal_(module.out_proj.weight, std=out_proj_std) - elif isinstance(module, HFCLIPMLP): - factor = self.config.initializer_factor - in_proj_std = ( - (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor - ) - fc_std = (2 * module.config.hidden_size) ** -0.5 * factor - normal_(module.fc1.weight, std=fc_std) - normal_(module.fc2.weight, std=in_proj_std) - elif isinstance(module, HFCLIPModel): - normal_( - module.text_projection.weight, - std=module.text_embed_dim**-0.5 * self.config.initializer_factor, - ) - normal_( - module.visual_projection.weight, - std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, - ) - elif isinstance(module, HFCLIPVisionModelWithProjection): - normal_( - module.visual_projection.weight, - std=self.config.hidden_size**-0.5 * self.config.initializer_factor, - ) - elif isinstance(module, HFCLIPTextModelWithProjection): - normal_( - module.text_projection.weight, - std=self.config.hidden_size**-0.5 * self.config.initializer_factor, - ) - - if isinstance(module, nn.LayerNorm): - module.bias.zero_() - ones_(module.weight) - if isinstance(module, LinearClass) and module.bias is not None: - module.bias.zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, HFCLIPEncoder): - module.gradient_checkpointing = value - - def post_init(self): - self.apply(self._init_weights) - # register_load_torch_hook - # self.register_load_torch_hook() - - def register_load_torch_hook(self, function=None): - if hasattr(self, 
"load_torch_hook"): - self.load_torch_hook.remove() - if function is None: - - def map_from(module, state_dict, *args, **kwargs): - if state_dict.pop("is_torch_weight", False): - need_transposed = [] - for name, layer in module.named_sublayers(include_self=True): - if isinstance(layer, nn.Linear): - need_transposed.append(name + ".weight") - module.need_transposed = need_transposed - for key in need_transposed: - state_dict[key] = state_dict[key].T - - else: - map_from = function - self.load_torch_hook = self.register_load_state_dict_pre_hook(map_from, with_module=True) - return self.load_torch_hook - - def remove_load_torch_hook(self): - if hasattr(self, "load_torch_hook"): - self.load_torch_hook.remove() - - def to(self=None, device=None, dtype=None, blocking=None): - return self._to_impl( - device=device, - dtype=dtype, - blocking=blocking, - include_sublayers=True, - floating_only=True, - ) - - -class HFCLIPEncoder(nn.Layer): - """ - Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a - [`CLIPEncoderLayer`]. - - Args: - config: CLIPConfig - """ - - def __init__(self, config: CLIPConfig): - super().__init__() - self.config = config - self.layers = nn.LayerList([HFCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - inputs_embeds, - attention_mask: Optional[paddle.Tensor] = None, - causal_attention_mask: Optional[paddle.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: - r""" - Args: - inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. - attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - causal_attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Causal mask for the text model. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
- """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - - hidden_states = inputs_embeds - for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs, output_attentions) - - return custom_forward - - layer_outputs = recompute( - create_custom_forward(encoder_layer), - hidden_states, - attention_mask, - causal_attention_mask, - ) - else: - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - causal_attention_mask, - output_attentions=output_attentions, - ) - - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions - ) - - -class HFCLIPTextTransformer(nn.Layer): - def __init__(self, config: CLIPTextConfig): - super().__init__() - self.config = config - embed_dim = config.hidden_size - self.embeddings = HFCLIPTextEmbeddings(config) - self.encoder = HFCLIPEncoder(config) - self.final_layer_norm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) - - def forward( - self, - input_ids: Optional[paddle.Tensor] = None, - attention_mask: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is None: - raise ValueError("You have to specify either input_ids") - - input_shape = input_ids.shape - input_ids = input_ids.reshape([-1, input_shape[-1]]) - - hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) - - bsz, seq_len = input_shape - # CLIP's text model uses causal mask, prepare it here. 
- # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 - causal_attention_mask = self._build_causal_attention_mask( - bsz, - seq_len, - hidden_states.dtype, - ) - # expand attention_mask - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - attention_mask = _expand_mask(attention_mask, hidden_states.dtype) - - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - attention_mask=attention_mask, - causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - last_hidden_state = encoder_outputs[0] - last_hidden_state = self.final_layer_norm(last_hidden_state) - - # text_embeds.shape = [batch_size, sequence_length, transformer.width] - # take features from the eot embedding (eot_token is the highest number in each sequence) - # casting to paddle.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14 - pooled_output = last_hidden_state[ - paddle.arange(last_hidden_state.shape[0], dtype=paddle.int32), input_ids.cast(paddle.int32).argmax(-1) - ] - - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - def _build_causal_attention_mask(self, bsz, seq_len, dtype): - mask = paddle.triu( - paddle.full((bsz, 1, seq_len, seq_len), finfo(dtype).min), - diagonal=1, - ) - return mask - - -class HFCLIPTextModel(HFCLIPPretrainedModel): - config_class = CLIPTextConfig - - _no_split_modules = ["HFCLIPEncoderLayer"] - - def __init__(self, config: CLIPTextConfig): - super().__init__(config) - self.text_model = HFCLIPTextTransformer(config) - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self) -> nn.Layer: - return self.text_model.embeddings.token_embedding - - def set_input_embeddings(self, value): - self.text_model.embeddings.token_embedding = value - - def forward( - self, - input_ids: Optional[paddle.Tensor] = None, - attention_mask: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - - Examples: - - ```python - >>> from paddlenlp.transformers import CLIPTokenizer, CLIPTextModel - - >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32") - >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") - - >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pd") - - >>> outputs = model(**inputs) - >>> last_hidden_state = outputs.last_hidden_state - >>> pooled_output = outputs.pooler_output # pooled (EOS token) states - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - return self.text_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - -class HFCLIPVisionTransformer(nn.Layer): - def __init__(self, config: CLIPVisionConfig): - super().__init__() - self.config = config - embed_dim = config.hidden_size - - 
self.embeddings = HFCLIPVisionEmbeddings(config) - self.pre_layrnorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) - self.encoder = HFCLIPEncoder(config) - self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) - - def forward( - self, - pixel_values: Optional[paddle.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - - hidden_states = self.embeddings(pixel_values) - hidden_states = self.pre_layrnorm(hidden_states) - - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - last_hidden_state = encoder_outputs[0] - pooled_output = last_hidden_state[:, 0, :] - pooled_output = self.post_layernorm(pooled_output) - - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - -class HFCLIPVisionModel(HFCLIPPretrainedModel): - config_class = CLIPVisionConfig - main_input_name = "pixel_values" - - def __init__(self, config: CLIPVisionConfig): - super().__init__(config) - self.vision_model = HFCLIPVisionTransformer(config) - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self) -> nn.Layer: - return self.vision_model.embeddings.patch_embedding - - def forward( - self, - pixel_values: Optional[paddle.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - - Examples: - - ```python - >>> from PIL import Image - >>> import requests - >>> from paddlenlp.transformers import CLIPProcessor, CLIPVisionModel - - >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32") - >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor(images=image, return_tensors="pd") - - >>> outputs = model(**inputs) - >>> last_hidden_state = outputs.last_hidden_state - >>> pooled_output = outputs.pooler_output # pooled CLS states - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - return self.vision_model( - pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - -class HFCLIPModel(HFCLIPPretrainedModel): - config_class = CLIPConfig - - def __init__(self, config: CLIPConfig): - super().__init__(config) - - if not isinstance(config.text_config, CLIPTextConfig): - raise ValueError( - "config.text_config is expected to be of type CLIPTextConfig but 
is of type" - f" {type(config.text_config)}." - ) - - if not isinstance(config.vision_config, CLIPVisionConfig): - raise ValueError( - "config.vision_config is expected to be of type CLIPVisionConfig but is of type" - f" {type(config.vision_config)}." - ) - - text_config = config.text_config - vision_config = config.vision_config - - self.projection_dim = config.projection_dim - self.text_embed_dim = text_config.hidden_size - self.vision_embed_dim = vision_config.hidden_size - - self.text_model = HFCLIPTextTransformer(text_config) - self.vision_model = HFCLIPVisionTransformer(vision_config) - - self.visual_projection = LinearClass(self.vision_embed_dim, self.projection_dim, bias_attr=False) - self.text_projection = LinearClass(self.text_embed_dim, self.projection_dim, bias_attr=False) - self.logit_scale = Parameter(paddle.ones((1,)) * self.config.logit_scale_init_value) - - # Initialize weights and apply final processing - self.post_init() - - def get_text_features( - self, - input_ids: Optional[paddle.Tensor] = None, - attention_mask: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> paddle.Tensor: - r""" - Returns: - text_features (`paddle.Tensor` of shape `(batch_size, output_dim`): The text embeddings obtained by - applying the projection layer to the pooled output of [`CLIPTextModel`]. - - Examples: - - ```python - >>> from paddlenlp.transformers import CLIPTokenizer, CLIPModel - - >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") - >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") - - >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pd") - >>> text_features = model.get_text_features(**inputs) - ```""" - # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - text_outputs = self.text_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = text_outputs[1] - text_features = self.text_projection(pooled_output) - - return text_features - - def get_image_features( - self, - pixel_values: Optional[paddle.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> paddle.Tensor: - r""" - Returns: - image_features (`paddle.Tensor` of shape `(batch_size, output_dim`): The image embeddings obtained by - applying the projection layer to the pooled output of [`CLIPVisionModel`]. 
- - Examples: - - ```python - >>> from PIL import Image - >>> import requests - >>> from paddlenlp.transformers import CLIPProcessor, CLIPModel - - >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") - >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor(images=image, return_tensors="pd") - - >>> image_features = model.get_image_features(**inputs) - ```""" - # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - vision_outputs = self.vision_model( - pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = vision_outputs[1] # pooled_output - image_features = self.visual_projection(pooled_output) - - return image_features - - def forward( - self, - input_ids: Optional[paddle.Tensor] = None, - pixel_values: Optional[paddle.Tensor] = None, - attention_mask: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.Tensor] = None, - return_loss: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, HFCLIPOutput]: - r""" - Returns: - - Examples: - - ```python - >>> import paddle.nn.functional as F - >>> import requests - >>> from PIL import Image - >>> from paddlenlp.transformers import CLIPProcessor, CLIPModel - - >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") - >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor( - ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pd", padding=True - ... ) - - >>> outputs = model(**inputs) - >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score - >>> probs = F.softmax(logits_per_image.softmax, axis=1) # we can take the softmax to get the label probabilities - ```""" - # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - vision_outputs = self.vision_model( - pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - text_outputs = self.text_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - image_embeds = vision_outputs[1] - image_embeds = self.visual_projection(image_embeds) - - text_embeds = text_outputs[1] - text_embeds = self.text_projection(text_embeds) - - # normalized features - image_embeds = image_embeds / image_embeds.norm(p=2, axis=-1, keepdim=True) - text_embeds = text_embeds / text_embeds.norm(p=2, axis=-1, keepdim=True) - - # cosine similarity as logits - logit_scale = self.logit_scale.exp() - logits_per_text = paddle.matmul(text_embeds, image_embeds.t()) * logit_scale - logits_per_image = logits_per_text.t() - - loss = None - if return_loss: - loss = clip_loss(logits_per_text) - - if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) - return ((loss,) + output) if loss is not None else output - - return HFCLIPOutput( - loss=loss, - logits_per_image=logits_per_image, - logits_per_text=logits_per_text, - text_embeds=text_embeds, - image_embeds=image_embeds, - text_model_output=text_outputs, - vision_model_output=vision_outputs, - ) - - -class HFCLIPTextModelWithProjection(HFCLIPPretrainedModel): - config_class = CLIPTextConfig - - _no_split_modules = ["HFCLIPEncoderLayer"] - - def __init__(self, config: CLIPTextConfig): - super().__init__(config) - - self.text_model = HFCLIPTextTransformer(config) - - self.text_projection = LinearClass(config.hidden_size, config.projection_dim, bias_attr=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self) -> nn.Layer: - return self.text_model.embeddings.token_embedding - - def set_input_embeddings(self, value): - self.text_model.embeddings.token_embedding = value - - def forward( - self, - input_ids: Optional[paddle.Tensor] = None, - attention_mask: Optional[paddle.Tensor] = None, - position_ids: Optional[paddle.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, HFCLIPTextModelOutput]: - r""" - Returns: - - Examples: - - ```python - >>> from paddlenlp.transformers import CLIPTokenizer, CLIPTextModelWithProjection - - >>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32") - >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") - - >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pd") - - >>> outputs = model(**inputs) - >>> text_embeds = outputs.text_embeds - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - text_outputs = self.text_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - 
return_dict=return_dict, - ) - - pooled_output = text_outputs[1] - - text_embeds = self.text_projection(pooled_output) - - if not return_dict: - outputs = (text_embeds, text_outputs[0]) + text_outputs[2:] - return tuple(output for output in outputs if output is not None) - - return HFCLIPTextModelOutput( - text_embeds=text_embeds, - last_hidden_state=text_outputs.last_hidden_state, - hidden_states=text_outputs.hidden_states, - attentions=text_outputs.attentions, - ) - - -class HFCLIPVisionModelWithProjection(HFCLIPPretrainedModel): - config_class = CLIPVisionConfig - main_input_name = "pixel_values" - - def __init__(self, config: CLIPVisionConfig): - super().__init__(config) - - self.vision_model = HFCLIPVisionTransformer(config) - - self.visual_projection = LinearClass(config.hidden_size, config.projection_dim, bias_attr=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self) -> nn.Layer: - return self.vision_model.embeddings.patch_embedding - - def forward( - self, - pixel_values: Optional[paddle.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, HFCLIPVisionModelOutput]: - r""" - Returns: - - Examples: - - ```python - >>> from PIL import Image - >>> import requests - >>> from paddlenlp.transformers import CLIPProcessor, CLIPVisionModelWithProjection - - >>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32") - >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor(images=image, return_tensors="pd") - - >>> outputs = model(**inputs) - >>> image_embeds = outputs.image_embeds - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - vision_outputs = self.vision_model( - pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = vision_outputs[1] # pooled_output - - image_embeds = self.visual_projection(pooled_output) - - if not return_dict: - outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:] - return tuple(output for output in outputs if output is not None) - - return HFCLIPVisionModelOutput( - image_embeds=image_embeds, - last_hidden_state=vision_outputs.last_hidden_state, - hidden_states=vision_outputs.hidden_states, - attentions=vision_outputs.attentions, - ) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py deleted file mode 100644 index e102e022a55c..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ /dev/null @@ -1,717 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Callable, List, Optional, Union - -import numpy as np -import paddle -import PIL -from packaging import version - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer - -from ...configuration_utils import FrozenDict -from ...loaders import TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import DDIMScheduler -from ...utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor -from ..pipeline_utils import DiffusionPipeline -from . import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess -def preprocess(image): - if isinstance(image, paddle.Tensor): - return image - elif isinstance(image, PIL.Image.Image): - image = [image] - - if isinstance(image[0], PIL.Image.Image): - w, h = image[0].size - w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - - image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] - image = np.concatenate(image, axis=0) - image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = 2.0 * image - 1.0 - image = paddle.to_tensor(image) - elif isinstance(image[0], paddle.Tensor): - image = paddle.concat(image, axis=0) - return image - - -def posterior_sample(scheduler, latents, timestep, clean_latents, generator, eta): - # 1. get previous step value (=t-1) - prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps - - if prev_timestep <= 0: - return clean_latents - - # 2. compute alphas, betas - alpha_prod_t = scheduler.alphas_cumprod[timestep] - alpha_prod_t_prev = ( - scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod - ) - - variance = scheduler._get_variance(timestep, prev_timestep) - std_dev_t = eta * variance ** (0.5) - - # direction pointing to x_t - e_t = (latents - alpha_prod_t ** (0.5) * clean_latents) / (1 - alpha_prod_t) ** (0.5) - dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * e_t - noise = std_dev_t * randn_tensor(clean_latents.shape, dtype=clean_latents.dtype, generator=generator) - prev_latents = alpha_prod_t_prev ** (0.5) * clean_latents + dir_xt + noise - - return prev_latents - - -def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta): - # 1. get previous step value (=t-1) - prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps - - # 2. compute alphas, betas - alpha_prod_t = scheduler.alphas_cumprod[timestep] - alpha_prod_t_prev = ( - scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod - ) - - beta_prod_t = 1 - alpha_prod_t - - # 3. 
compute predicted original sample from predicted noise also called - # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5) - - # 4. Clip "predicted x_0" - if scheduler.config.clip_sample: - pred_original_sample = pred_original_sample.clip(-1, 1) - - # 5. compute variance: "sigma_t(η)" -> see formula (16) - # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) - variance = scheduler._get_variance(timestep, prev_timestep) - std_dev_t = eta * variance ** (0.5) - - # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred - - noise = (prev_latents - (alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction)) / ( - variance ** (0.5) * eta - ) - return noise - - -class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin): - r""" - Pipeline for text-guided image to image generation using Stable Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: DDIMScheduler, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = True, - ): - super().__init__() - - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " - "to update the config accordingly as leaving `steps_offset` might led to incorrect results" - " in future versions. 
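For orientation, `posterior_sample` and `compute_noise` above are two views of the same DDIM update, Eq. (12) of https://arxiv.org/abs/2010.02502, where in the notation below \(\alpha_t\) stands for the cumulative product `alphas_cumprod[t]` used in the code:

```latex
\hat{x}_0 = \frac{x_t - \sqrt{1-\alpha_t}\,\epsilon_\theta(x_t)}{\sqrt{\alpha_t}}, \qquad
\sigma_t = \eta\,\sqrt{\frac{1-\alpha_{t-1}}{1-\alpha_t}}\,\sqrt{1-\frac{\alpha_t}{\alpha_{t-1}}},

x_{t-1} = \sqrt{\alpha_{t-1}}\,\hat{x}_0 \;+\; \sqrt{1-\alpha_{t-1}-\sigma_t^2}\;\epsilon_\theta(x_t) \;+\; \sigma_t\,\epsilon, \qquad \epsilon \sim \mathcal{N}(0, I).
```

`posterior_sample` evaluates this update with the known clean latent substituted for \(\hat{x}_0\), so the source trajectory stays anchored to the input image, while `compute_noise` solves the same equation for \(\epsilon\) given an observed \(x_{t-1}\); the recovered noise is later fed back through `scheduler.step(..., variance_noise=noise, ...)` to keep the edited trajectory consistent with the source one.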
If you have downloaded this checkpoint from the Hugging Face Hub," - " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file" - ) - deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version).base_version - ) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 - if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: - deprecation_message = ( - "The configuration file of the unet has set the default `sample_size` to smaller than" - " 64 which seems highly unlikely .If your checkpoint is a fine-tuned version of any of the" - " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" - " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" - " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" - " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" - " in the config might lead to incorrect results in future versions. If you have downloaded this" - " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file" - ) - deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(unet.config) - new_config["sample_size"] = 64 - unet._internal_dict = FrozenDict(new_config) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. 
- - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. 
Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs - def check_inputs( - self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None - ): - if strength < 0 or strength > 1: - raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." 
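Stripped of validation, batching and textual-inversion handling, `_encode_prompt` above reduces to: tokenize, run the text encoder, and, for classifier-free guidance, prepend the unconditional embeddings so that a single U-Net batch covers both branches. A rough sketch with illustrative names (a `tokenizer`/`text_encoder` pair is assumed here, not the pipeline attributes themselves):

```python
import paddle

def encode_with_cfg(tokenizer, text_encoder, prompt, negative_prompt=""):
    # Tokenize conditional and unconditional prompts to the same fixed length.
    kwargs = dict(padding="max_length", max_length=tokenizer.model_max_length,
                  truncation=True, return_tensors="pd")
    cond_ids = tokenizer([prompt], **kwargs).input_ids
    uncond_ids = tokenizer([negative_prompt], **kwargs).input_ids

    cond = text_encoder(cond_ids)[0]      # [1, seq_len, hidden]
    uncond = text_encoder(uncond_ids)[0]

    # Order matters downstream: the later chunk(2) assumes [uncond, cond].
    return paddle.concat([uncond, cond])
```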
- ) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps - def get_timesteps(self, num_inference_steps, strength): - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] - - return timesteps, num_inference_steps - t_start - - def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): - image = image.cast(dtype) - - batch_size = image.shape[0] - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if isinstance(generator, list): - init_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) - ] - init_latents = paddle.concat(init_latents, axis=0) - else: - init_latents = self.vae.encode(image).latent_dist.sample(generator) - - init_latents = self.vae.config.scaling_factor * init_latents - - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: - # expand init_latents for batch_size - deprecation_message = ( - f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" - " images (`image`). Initial images are now duplicating to match the number of text prompts. 
Note" - " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many initial images as text prompts to suppress this warning." - ) - deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) - additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = paddle.concat([init_latents] * additional_image_per_prompt * num_images_per_prompt, axis=0) - elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." - ) - else: - init_latents = paddle.concat([init_latents] * num_images_per_prompt, axis=0) - - # add noise to latents using the timestep - shape = init_latents.shape - noise = randn_tensor(shape, generator=generator, dtype=dtype) - - # get latents - clean_latents = init_latents - init_latents = self.scheduler.add_noise(init_latents, noise, timestep) - latents = init_latents - - return latents, clean_latents - - @paddle.no_grad() - def __call__( - self, - prompt: Union[str, List[str]], - source_prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image] = None, - strength: float = 0.8, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[paddle.Tensor] = None, - source_guidance_scale: Optional[float] = 1, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.1, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`): - The target prompt or prompts to guide the image generation. - source_prompt (`str` or `List[str]`): - The source prompt or prompts describe the input image. - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. - strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. - `image` will be used as a starting point, adding more noise to it the larger the `strength`. The - number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added - noise will be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. This parameter will be modulated by `strength`. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. 
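Two details of the setup above are worth spelling out. With `num_inference_steps=50` and `strength=0.8`, `get_timesteps` keeps only the last 40 entries of the schedule (`t_start = 10`), and `latent_timestep` is the first of those, so the init image is partially noised rather than replaced. `prepare_latents` then reduces to roughly the following sketch (single image, illustrative names; the real method also handles batching, generator lists and the deprecation path):

```python
# Assumes `vae`, `scheduler`, `generator`, a preprocessed `image` (NCHW in [-1, 1])
# and `latent_timestep` from get_timesteps are already available.
init_latents = vae.encode(image).latent_dist.sample(generator)
init_latents = vae.config.scaling_factor * init_latents

noise = randn_tensor(init_latents.shape, generator=generator, dtype=init_latents.dtype)
clean_latents = init_latents                                          # kept for posterior_sample
latents = scheduler.add_noise(init_latents, noise, latent_timestep)   # noisier as strength -> 1
```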
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The negative prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - source_guidance_scale (`float`, *optional*, defaults to 1): - Guidance scale for the source prompt. This is useful to control the amount of influence the source - prompt for encoding. - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.1): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 1. Check inputs - self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. 
Encode target prompt and source prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - source_prompt_embeds = self._encode_prompt( - source_prompt, num_images_per_prompt, do_classifier_free_guidance, None - ) - - # 4. Preprocess image - image = preprocess(image) - - # 5. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) - - # 6. Prepare latent variables - latents, clean_latents = self.prepare_latents( - image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, generator - ) - source_latents = latents - - # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - generator = extra_step_kwargs.pop("generator", None) - - # 8. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) - source_latent_model_input = paddle.concat([source_latents] * 2) - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - source_latent_model_input = self.scheduler.scale_model_input(source_latent_model_input, t) - - # predict the noise residual - concat_latent_model_input = paddle.stack( - [ - source_latent_model_input[0], - latent_model_input[0], - source_latent_model_input[1], - latent_model_input[1], - ], - axis=0, - ) - concat_prompt_embeds = paddle.stack( - [ - source_prompt_embeds[0], - prompt_embeds[0], - source_prompt_embeds[1], - prompt_embeds[1], - ], - axis=0, - ) - concat_noise_pred = self.unet( - concat_latent_model_input, t, encoder_hidden_states=concat_prompt_embeds - ).sample - - # perform guidance - ( - source_noise_pred_uncond, - noise_pred_uncond, - source_noise_pred_text, - noise_pred_text, - ) = concat_noise_pred.chunk(4, axis=0) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - source_noise_pred = source_noise_pred_uncond + source_guidance_scale * ( - source_noise_pred_text - source_noise_pred_uncond - ) - - # Sample source_latents from the posterior distribution. - prev_source_latents = posterior_sample( - self.scheduler, source_latents, t, clean_latents, generator=generator, **extra_step_kwargs - ) - # Compute noise. - noise = compute_noise( - self.scheduler, prev_source_latents, source_latents, t, source_noise_pred, **extra_step_kwargs - ) - source_latents = prev_source_latents - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, t, latents, variance_noise=noise, **extra_step_kwargs - ).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 9. Post-processing - image = self.decode_latents(latents) - - # 10. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - - # 11. 
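Putting the pieces together, the pipeline above is typically driven end to end as follows. This is a hedged usage sketch: it assumes the class is exported as `ppdiffusers.CycleDiffusionPipeline`, that a Stable Diffusion v1.x checkpoint such as `CompVis/stable-diffusion-v1-4` is used, and that `horse.png` is a placeholder for your own 512x512 input image; CycleDiffusion requires a DDIM scheduler and a non-zero `eta`.

```python
from PIL import Image

from ppdiffusers import CycleDiffusionPipeline, DDIMScheduler

# CycleDiffusion relies on the DDIM update, so swap in a DDIMScheduler explicitly.
scheduler = DDIMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
pipe = CycleDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=scheduler)

init_image = Image.open("horse.png").convert("RGB").resize((512, 512))  # placeholder path

image = pipe(
    prompt="An astronaut riding an elephant",      # what the edited image should show
    source_prompt="An astronaut riding a horse",   # what the input image shows
    image=init_image,
    num_inference_steps=100,
    strength=0.8,
    eta=0.1,                                       # eta > 0 is needed for the stochastic step
    guidance_scale=2.0,
    source_guidance_scale=1.0,
).images[0]
image.save("astronaut_elephant.png")
```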
Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py deleted file mode 100644 index 9f47de32c691..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py +++ /dev/null @@ -1,434 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Callable, Dict, List, Optional, Union - -import paddle -import PIL - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTokenizer - -from ...pipeline_utils import DiffusionPipeline -from ...schedulers import DDIMScheduler -from ...utils import logging, randn_tensor -from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel -from . import StableDiffusionPipelineOutput - -logger = logging.get_logger(__name__) - - -def posterior_sample(scheduler, latents, timestep, clean_latents, generator, eta): - # 1. get previous step value (=t-1) - prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps - - if prev_timestep <= 0: - return clean_latents - - # 2. compute alphas, betas - alpha_prod_t = scheduler.alphas_cumprod[timestep] - alpha_prod_t_prev = ( - scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod - ) - - variance = scheduler._get_variance(timestep, prev_timestep) - std_dev_t = eta * variance ** (0.5) - - # direction pointing to x_t - e_t = (latents - alpha_prod_t ** (0.5) * clean_latents) / (1 - alpha_prod_t) ** (0.5) - dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * e_t - noise = std_dev_t * randn_tensor(clean_latents.shape, dtype=clean_latents.dtype, generator=generator) - prev_latents = alpha_prod_t_prev ** (0.5) * clean_latents + dir_xt + noise - - return prev_latents - - -def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta): - # 1. get previous step value (=t-1) - prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps - - # 2. compute alphas, betas - alpha_prod_t = scheduler.alphas_cumprod[timestep] - alpha_prod_t_prev = ( - scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod - ) - - beta_prod_t = 1 - alpha_prod_t - - # 3. compute predicted original sample from predicted noise also called - # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5) - - # 4. 
Clip "predicted x_0" - if scheduler.config.clip_sample: - pred_original_sample = pred_original_sample.clip(-1, 1) - - # 5. compute variance: "sigma_t(η)" -> see formula (16) - # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) - variance = scheduler._get_variance(timestep, prev_timestep) - std_dev_t = eta * variance ** (0.5) - - # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred - - noise = (prev_latents - (alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction)) / ( - variance ** (0.5) * eta - ) - return noise - - -class FastDeployCycleDiffusionPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): - r""" - Pipeline for text-guided image to image generation using Stable Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular xxxx, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: DDIMScheduler, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = False, - ): - super().__init__() - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." 
- ) - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - self.register_modules( - vae_encoder=vae_encoder, - vae_decoder=vae_decoder, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.register_to_config(requires_safety_checker=requires_safety_checker) - self.post_init() - self.change_scheduler("ddim") - - def __call__( - self, - prompt: Union[str, List[str]], - source_prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image] = None, - height: Optional[int] = None, - width: Optional[int] = None, - strength: float = 0.8, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[paddle.Tensor] = None, - source_guidance_scale: Optional[float] = 1, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.1, - latents: Optional[paddle.Tensor] = None, - parse_prompt_type: Optional[str] = "lpw", - max_embeddings_multiples: Optional[int] = 3, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - infer_op_dict: Dict[str, str] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`): - The target prompt or prompts to guide the image generation. - source_prompt (`str` or `List[str]`): - The source prompt or prompts describe the input image. - height (`int`, *optional*, defaults to None): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to None): - The width in pixels of the generated image. - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. - strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. - `image` will be used as a starting point, adding more noise to it the larger the `strength`. The - number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added - noise will be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. This parameter will be modulated by `strength`. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The negative prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - source_guidance_scale (`float`, *optional*, defaults to 1): - Guidance scale for the source prompt. This is useful to control the amount of influence the source - prompt for encoding. - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.1): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Preprocess image - init_image = self.image_processor.preprocess(image, height=height, width=width) - height, width = init_image.shape[-2:] - - # 1. 
Check inputs - self.check_inputs( - prompt, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - strength, - ) - infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode target prompt and source prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - parse_prompt_type=parse_prompt_type, - max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), - ) - source_prompt_embeds = self._encode_prompt( - source_prompt, - num_images_per_prompt, - do_classifier_free_guidance, - parse_prompt_type=parse_prompt_type, - max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), - ) - - # 5. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - # 6. Prepare latent variables - # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) - is_strength_max = strength == 1.0 - latents, clean_latents = self.prepare_latents( - batch_size * num_images_per_prompt, - height, - width, - generator, - latents, - image=init_image, - timestep=latent_timestep, - is_strength_max=is_strength_max, - return_image_latents=True, - infer_op=infer_op_dict.get("vae_encoder", None), - ) - source_latents = latents - - # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - generator = extra_step_kwargs.pop("generator", None) - - # 8. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) - source_latent_model_input = paddle.concat([source_latents] * 2) - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - source_latent_model_input = self.scheduler.scale_model_input(source_latent_model_input, t) - - # predict the noise residual - concat_latent_model_input = paddle.stack( - [ - source_latent_model_input[0], - latent_model_input[0], - source_latent_model_input[1], - latent_model_input[1], - ], - axis=0, - ) - concat_prompt_embeds = paddle.stack( - [ - source_prompt_embeds[0], - prompt_embeds[0], - source_prompt_embeds[1], - prompt_embeds[1], - ], - axis=0, - ) - - unet_inputs = dict( - sample=concat_latent_model_input, - timestep=t, - encoder_hidden_states=concat_prompt_embeds, - infer_op=infer_op_dict.get("unet", None), - output_shape=concat_latent_model_input.shape, - ) - # predict the noise residual - concat_noise_pred = self.unet(**unet_inputs)[0] - - # perform guidance - ( - source_noise_pred_uncond, - noise_pred_uncond, - source_noise_pred_text, - noise_pred_text, - ) = concat_noise_pred.chunk(4, axis=0) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - source_noise_pred = source_noise_pred_uncond + source_guidance_scale * ( - source_noise_pred_text - source_noise_pred_uncond - ) - - # Sample source_latents from the posterior distribution. - prev_source_latents = posterior_sample( - self.scheduler, source_latents, t, clean_latents, generator=generator, **extra_step_kwargs - ) - # Compute noise. 
- noise = compute_noise( - self.scheduler, prev_source_latents, source_latents, t, source_noise_pred, **extra_step_kwargs - ) - source_latents = prev_source_latents - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, t, latents, variance_noise=noise, **extra_step_kwargs - ).prev_sample - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - if i == len(timesteps) - 1: - # sync for accuracy it/s measure - paddle.device.cuda.synchronize() - - if not output_type == "latent": - image = self._decode_vae_latents( - latents / self.vae_scaling_factor, infer_op=infer_op_dict.get("vae_decoder", None) - ) - image, has_nsfw_concept = self.run_safety_checker(image) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py deleted file mode 100644 index 521171611413..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py +++ /dev/null @@ -1,329 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Callable, Dict, List, Optional, Union - -import paddle -import PIL - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTokenizer - -from ...pipeline_utils import DiffusionPipeline -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import logging -from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel -from . import StableDiffusionPipelineOutput - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class FastDeployStableDiffusionPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): - r""" - Pipeline for text-to-image generation using Stable Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving etc.) - - Args: - vae_encoder ([`FastDeployRuntimeModel`]): - Variational Auto-Encoder (VAE) Model to encode images to latent representations. 
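One detail shared by both CycleDiffusion denoising loops above: each U-Net call batches four streams in the fixed order [source-uncond, target-uncond, source-cond, target-cond], which is exactly what the `chunk(4, axis=0)` unpacking relies on. With clearer, purely illustrative names, the guidance step amounts to:

```python
# concat_noise_pred has batch dimension 4, in the stacking order used above.
src_uncond, tgt_uncond, src_cond, tgt_cond = concat_noise_pred.chunk(4, axis=0)

# Classifier-free guidance for the edited image and for the source reconstruction.
noise_pred = tgt_uncond + guidance_scale * (tgt_cond - tgt_uncond)
source_noise_pred = src_uncond + source_guidance_scale * (src_cond - src_uncond)
```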
- vae_decoder ([`FastDeployRuntimeModel`]): - Variational Auto-Encoder (VAE) Model to decode images from latent representations. - text_encoder ([`FastDeployRuntimeModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`FastDeployRuntimeModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`PNDMScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] - or [`DPMSolverMultistepScheduler`]. - safety_checker ([`FastDeployRuntimeModel`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - _optional_components = ["vae_encoder", "safety_checker", "feature_extractor"] - - def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = False, - ): - super().__init__() - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
- ) - - self.register_modules( - vae_encoder=vae_encoder, - vae_decoder=vae_decoder, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.register_to_config(requires_safety_checker=requires_safety_checker) - self.post_init() - - def __call__( - self, - prompt: Union[str, List[str]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - parse_prompt_type: Optional[str] = "lpw", - max_embeddings_multiples: Optional[int] = 3, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, - controlnet_conditioning_scale: float = 1.0, - infer_op_dict: Dict[str, str] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`int`, *optional*, defaults to None): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to None): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Default height and width to unet - height = height or 512 - width = width or 512 - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - ) - infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # do_controlnet - do_controlnet = controlnet_cond is not None - if do_controlnet: - control_image, control_conditioning_scale = self.prepare_controlnet_cond( - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=controlnet_conditioning_scale, - width=width, - height=height, - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - ) - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - parse_prompt_type=parse_prompt_type, - max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps) - - # 5. 
Prepare latent variables - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - height, - width, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - - is_scheduler_support_step_index = self.is_scheduler_support_step_index() - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) - else: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - unet_inputs = dict( - sample=latent_model_input, - timestep=t, - encoder_hidden_states=prompt_embeds, - infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, - ) - if do_controlnet: - unet_inputs["controlnet_cond"] = control_image - unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale - # predict the noise residual - noise_pred_unet = self.unet(**unet_inputs)[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - else: - noise_pred = noise_pred_unet - # compute the previous noisy sample x_t -> x_t-1 - if is_scheduler_support_step_index: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, step_index=i, return_pred_original_sample=False, **extra_step_kwargs - ) - else: - scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) - latents = scheduler_output.prev_sample - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - if i == len(timesteps) - 1: - # sync for accuracy it/s measure - paddle.device.cuda.synchronize() - - if not output_type == "latent": - image = self._decode_vae_latents( - latents / self.vae_scaling_factor, infer_op=infer_op_dict.get("vae_decoder", None) - ) - image, has_nsfw_concept = self.run_safety_checker(image) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_controlnet.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_controlnet.py deleted file mode 100644 index 324d66f3e018..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_controlnet.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
-# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .pipeline_fastdeploy_stable_diffusion import FastDeployStableDiffusionPipeline - - -class FastDeployStableDiffusionControlNetPipeline(FastDeployStableDiffusionPipeline): - def __call__( - self, - *args, - **kwargs, - ): - controlnet_cond = kwargs.pop("controlnet_cond", None) - image = kwargs.pop("image", None) - if controlnet_cond is None: - kwargs["controlnet_cond"] = image - else: - kwargs["controlnet_cond"] = controlnet_cond - return super().__call__(*args, **kwargs) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_image_variation.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_image_variation.py deleted file mode 100644 index 99bfe4f96ca2..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_image_variation.py +++ /dev/null @@ -1,328 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Callable, Dict, List, Optional, Union - -import paddle -import PIL - -from paddlenlp.transformers import CLIPImageProcessor -from ppdiffusers.pipelines.fastdeploy_utils import ( - FastDeployDiffusionPipelineMixin, - FastDeployRuntimeModel, -) - -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import logging -from ..pipeline_utils import DiffusionPipeline -from . import StableDiffusionPipelineOutput - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class FastDeployStableDiffusionImageVariationPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): - r""" - Pipeline to generate variations from an input image using Stable Diffusion. - - Args: - vae_encoder ([`FastDeployRuntimeModel`]): - Variational Auto-Encoder (VAE) Model to encode images to latent representations. - vae_decoder ([`FastDeployRuntimeModel`]): - Variational Auto-Encoder (VAE) Model to decode images from latent representations. - image_encoder ([`CLIPVisionModelWithProjection`]): - Frozen CLIP image-encoder. Stable Diffusion Image Variation uses the vision portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModelWithProjection), - specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. 
- unet ([`FastDeployRuntimeModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`FastDeployRuntimeModel`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - # TODO: feature_extractor is required to encode images (if they are in PIL format), - # we should give a descriptive message if the pipeline doesn't have one. - _optional_components = ["safety_checker"] - - def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - image_encoder: FastDeployRuntimeModel, - unet: FastDeployRuntimeModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = False, - ): - super().__init__() - - if safety_checker is None and requires_safety_checker: - logger.warn( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
- ) - - self.register_modules( - vae_encoder=vae_encoder, - vae_decoder=vae_decoder, - image_encoder=image_encoder, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.register_to_config(requires_safety_checker=requires_safety_checker) - self.post_init() - - def _encode_image(self, image, num_images_per_prompt, do_classifier_free_guidance, infer_op_dict): - if not isinstance(image, paddle.Tensor): - image = self.feature_extractor(images=image, return_tensors="pd").pixel_values - - image_encoder_inputs = dict( - pixel_values=image, - infer_op=infer_op_dict.get("image_encoder", None), - output_shape=[image.shape[0], 768], - ) - image_embeddings = self.image_encoder(**image_encoder_inputs)[0] - image_embeddings = image_embeddings.unsqueeze(1) - - # duplicate image embeddings for each generation per prompt, using mps friendly method - bs_embed, seq_len, _ = image_embeddings.shape - image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1]) - image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - if do_classifier_free_guidance: - negative_prompt_embeds = paddle.zeros_like(image_embeddings) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings]) - - return image_embeddings - - def check_inputs(self, image, height, width, callback_steps): - if ( - not isinstance(image, paddle.Tensor) - and not isinstance(image, PIL.Image.Image) - and not isinstance(image, list) - ): - raise ValueError( - "`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" - f" {type(image)}" - ) - - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - @paddle.no_grad() - def __call__( - self, - image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor], - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - infer_op_dict: Dict[str, str] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `paddle.Tensor`): - The image or images to guide the image generation. If you provide a tensor, it needs to comply with the - configuration of - [this](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json) - `CLIPImageProcessor` - height (`int`, *optional*, defaults to None): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to None): - The width in pixels of the generated image. 
- num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - infer_op_dict: The parameter infer_op_dict is a dictionary that maps module to it's inference op. The purpose of this dictionary is to store inferred operations or operations that have been deduced or determined during some process. The op are choosen from the following: 'None', 'zero_copy_infer', 'raw'. - - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) - - # 0. Default height and width to unet - height = height or 512 - width = width or 512 - - # 1. Check inputs. Raise error if not correct - self.check_inputs(image, height, width, callback_steps) - - # 2. 
Define call parameters - if isinstance(image, PIL.Image.Image): - batch_size = 1 - elif isinstance(image, list): - batch_size = len(image) - else: - batch_size = image.shape[0] - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input image - image_embeddings = self._encode_image(image, num_images_per_prompt, do_classifier_free_guidance, infer_op_dict) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps) - - # 5. Prepare latent variables - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - height, - width, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - is_scheduler_support_step_index = self.is_scheduler_support_step_index() - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) - else: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - unet_inputs = dict( - sample=latent_model_input, - timestep=t, - encoder_hidden_states=image_embeddings, - infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, - ) - noise_pred = self.unet(**unet_inputs)[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - if is_scheduler_support_step_index: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, step_index=i, return_pred_original_sample=False, **extra_step_kwargs - ) - else: - scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) - latents = scheduler_output.prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - if i == len(timesteps) - 1: - # sync for accuracy it/s measure - paddle.device.cuda.synchronize() - - # 8. Post-processing - image = self._decode_vae_latents( - latents / self.vae_scaling_factor, infer_op=infer_op_dict.get("vae_decoder", None) - ) - - # 9. 
Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image) - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_img2img.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_img2img.py deleted file mode 100644 index 1a26e8f82a0d..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_img2img.py +++ /dev/null @@ -1,352 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Callable, Dict, List, Optional, Union - -import paddle -import PIL - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTokenizer - -from ...pipeline_utils import DiffusionPipeline -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import logging -from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel -from . import StableDiffusionPipelineOutput - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class FastDeployStableDiffusionImg2ImgPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): - r""" - Pipeline for text-guided image-to-image generation using Stable Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving etc.) - - Args: - vae_encoder ([`FastDeployRuntimeModel`]): - Variational Auto-Encoder (VAE) Model to encode images to latent representations. - vae_decoder ([`FastDeployRuntimeModel`]): - Variational Auto-Encoder (VAE) Model to decode images from latent representations. - text_encoder ([`FastDeployRuntimeModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`FastDeployRuntimeModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. 
Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`PNDMScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] - or [`DPMSolverMultistepScheduler`]. - safety_checker ([`FastDeployRuntimeModel`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = False, - ): - super().__init__() - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - self.register_modules( - vae_encoder=vae_encoder, - vae_decoder=vae_decoder, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.register_to_config(requires_safety_checker=requires_safety_checker) - self.post_init() - - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: Union[paddle.Tensor, PIL.Image.Image] = None, - height: Optional[int] = None, - width: Optional[int] = None, - strength: float = 0.8, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - parse_prompt_type: Optional[str] = "lpw", - max_embeddings_multiples: Optional[int] = 3, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, - controlnet_conditioning_scale: float = 1.0, - infer_op_dict: Dict[str, str] = None, - ): - r""" - Function invoked when calling the pipeline for generation. 
- - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`int`, *optional*, defaults to None): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to None): - The width in pixels of the generated image. - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. - strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` - will be used as a starting point, adding more noise to it the larger the `strength`. The number of - denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will - be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. This parameter will be modulated by `strength`. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
- return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Preprocess image - init_image = self.image_processor.preprocess(image, height=height, width=width) - height, width = init_image.shape[-2:] - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - strength, - ) - infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # do_controlnet - do_controlnet = controlnet_cond is not None - if do_controlnet: - control_image, control_conditioning_scale = self.prepare_controlnet_cond( - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=controlnet_conditioning_scale, - width=width, - height=height, - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - ) - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - parse_prompt_type=parse_prompt_type, - max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - - # 5. Prepare latent variables - # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) - # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise - is_strength_max = strength == 1.0 - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - height, - width, - generator, - latents, - image=init_image, - timestep=latent_timestep, - is_strength_max=is_strength_max, - infer_op=infer_op_dict.get("vae_encoder", None), - ) - - # 6. 
Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - - is_scheduler_support_step_index = self.is_scheduler_support_step_index() - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) - else: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - unet_inputs = dict( - sample=latent_model_input, - timestep=t, - encoder_hidden_states=prompt_embeds, - infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, - ) - if do_controlnet: - unet_inputs["controlnet_cond"] = control_image - unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale - # predict the noise residual - noise_pred_unet = self.unet(**unet_inputs)[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - else: - noise_pred = noise_pred_unet - - # compute the previous noisy sample x_t -> x_t-1 - if is_scheduler_support_step_index: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, step_index=i, return_pred_original_sample=False, **extra_step_kwargs - ) - else: - scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) - latents = scheduler_output.prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - if i == len(timesteps) - 1: - # sync for accuracy it/s measure - paddle.device.cuda.synchronize() - - if not output_type == "latent": - image = self._decode_vae_latents( - latents / self.vae_scaling_factor, infer_op=infer_op_dict.get("vae_decoder", None) - ) - image, has_nsfw_concept = self.run_safety_checker(image) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint.py deleted file mode 100644 index 99542aa97623..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint.py +++ /dev/null @@ -1,556 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Callable, Dict, List, Optional, Union - -import numpy as np -import paddle -import PIL - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTokenizer - -from ...pipeline_utils import DiffusionPipeline -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import PIL_INTERPOLATION, logging -from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel -from . import StableDiffusionPipelineOutput - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -def prepare_mask_and_masked_image(image, mask, height=None, width=None, return_image: bool = False): - """ - Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be - converted to ``paddle.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the - ``image`` and ``1`` for the ``mask``. - - The ``image`` will be converted to ``paddle.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be - binarized (``mask > 0.5``) and cast to ``paddle.float32`` too. - - Args: - image (Union[np.array, PIL.Image, paddle.Tensor]): The image to inpaint. - It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width`` - ``paddle.Tensor`` or a ``batch x channels x height x width`` ``paddle.Tensor``. - mask (_type_): The mask to apply to the image, i.e. regions to inpaint. - It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width`` - ``paddle.Tensor`` or a ``batch x 1 x height x width`` ``paddle.Tensor``. - - - Raises: - ValueError: ``paddle.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``paddle.Tensor`` mask - should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions. - TypeError: ``mask`` is a ``paddle.Tensor`` but ``image`` is not - (ot the other way around). - - Returns: - tuple[paddle.Tensor]: The pair (mask, masked_image) as ``paddle.Tensor`` with 4 - dimensions: ``batch x channels x height x width``. 
- """ - - if image is None: - raise ValueError("`image` input cannot be undefined.") - - if mask is None: - raise ValueError("`mask_image` input cannot be undefined.") - - if isinstance(image, paddle.Tensor): - if not isinstance(mask, paddle.Tensor): - raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not") - - # Batch single image - if image.ndim == 3: - assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" - image = image.unsqueeze(0) - - # Batch and add channel dim for single mask - if mask.ndim == 2: - mask = mask.unsqueeze(0).unsqueeze(0) - - # Batch single mask or add channel dim - if mask.ndim == 3: - # Single batched mask, no channel dim or single mask not batched but channel dim - if mask.shape[0] == 1: - mask = mask.unsqueeze(0) - - # Batched masks no channel dim - else: - mask = mask.unsqueeze(1) - - assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" - assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" - assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" - - # Check image is in [-1, 1] - if image.min() < -1 or image.max() > 1: - raise ValueError("Image should be in [-1, 1] range") - - # Check mask is in [0, 1] - if mask.min() < 0 or mask.max() > 1: - raise ValueError("Mask should be in [0, 1] range") - - # Binarize mask - mask[mask < 0.5] = 0 - mask[mask >= 0.5] = 1 - - # Image as float32 - image = image.cast(dtype=paddle.float32) - elif isinstance(mask, paddle.Tensor): - raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") - else: - # preprocess image - if isinstance(image, (PIL.Image.Image, np.ndarray)): - image = [image] - if isinstance(image, list) and isinstance(image[0], PIL.Image.Image): - # resize all images w.r.t passed height an width - if width is None or height is None: - w, h = image[0].size - else: - w, h = width, height - w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - image = [i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) for i in image] - image = [np.array(i.convert("RGB"))[None, :] for i in image] - image = np.concatenate(image, axis=0) - elif isinstance(image, list) and isinstance(image[0], np.ndarray): - image = np.concatenate([i[None, :] for i in image], axis=0) - - image = image.transpose(0, 3, 1, 2) - image = paddle.to_tensor(image, dtype=paddle.float32) / 127.5 - 1.0 - - # preprocess mask - if isinstance(mask, (PIL.Image.Image, np.ndarray)): - mask = [mask] - - if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image): - # resize all images w.r.t passed height an width - if width is None or height is None: - w, h = mask[0].size - else: - w, h = width, height - w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - mask = [i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) for i in mask] - mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) - mask = mask.astype(np.float32) / 255.0 - elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): - mask = np.concatenate([m[None, None, :] for m in mask], axis=0) - - mask[mask < 0.5] = 0 - mask[mask >= 0.5] = 1 - mask = paddle.to_tensor(mask) - - masked_image = image * (mask < 0.5) - - # n.b. 
ensure backwards compatibility as old function does not return image - if return_image: - return mask, masked_image, image - - return mask, masked_image - - -class FastDeployStableDiffusionInpaintPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): - r""" - Pipeline for text-guided image inpainting using Stable Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving etc.) - - Args: - vae_encoder ([`FastDeployRuntimeModel`]): - Variational Auto-Encoder (VAE) Model to encode images to latent representations. - vae_decoder ([`FastDeployRuntimeModel`]): - Variational Auto-Encoder (VAE) Model to decode images from latent representations. - text_encoder ([`FastDeployRuntimeModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`FastDeployRuntimeModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`PNDMScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] - or [`DPMSolverMultistepScheduler`]. - safety_checker ([`FastDeployRuntimeModel`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = False, - ): - super().__init__() - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
- ) - - self.register_modules( - vae_encoder=vae_encoder, - vae_decoder=vae_decoder, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.register_to_config(requires_safety_checker=requires_safety_checker) - self.post_init() - - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: Union[paddle.Tensor, PIL.Image.Image] = None, - mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, - height: int = None, - width: int = None, - strength: float = 1.0, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - add_predicted_noise: Optional[bool] = False, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - parse_prompt_type: Optional[str] = "lpw", - max_embeddings_multiples: Optional[int] = 3, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, - controlnet_conditioning_scale: float = 1.0, - infer_op_dict: Dict[str, str] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. This is the image whose masked region will be inpainted. - mask_image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be - replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a - PIL image, it will be converted to a single channel (luminance) before use. If mask is a tensor, the - expected shape should be either `(B, H, W, C)` or `(B, C, H, W)`, where C is 1 or 3. - height (`int`, *optional*, defaults to None): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to None): - The width in pixels of the generated image. - strength (`float`, *optional*, defaults to 1.0): - Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength` - is 1, the denoising process will be run on the masked area for the full number of iterations specified - in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more noise to - that region the larger the `strength`. If `strength` is 0, no inpainting will occur. - num_inference_steps (`int`, *optional*, defaults to 50): - The reference number of denoising steps. More denoising steps usually lead to a higher quality image at - the expense of slower inference. This parameter will be modulated by `strength`, as explained above. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. 
of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - add_predicted_noise (`bool`, *optional*, defaults to False): - Use predicted noise instead of random noise when constructing noisy versions of the original image in - the reverse diffusion process - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of [paddle generator(s)] to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noise tensor, sampled from a Gaussian distribution, to be used as inputs for image - generation. If not provided, a noise tensor will ge generated by sampling using the supplied random - `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Preprocess mask and image - mask, masked_image, init_image = prepare_mask_and_masked_image( - image, - mask_image, - height, - width, - return_image=True, - ) - height, width = init_image.shape[-2:] - - # 1. 
Check inputs - self.check_inputs( - prompt, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - strength, - ) - infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - parse_prompt_type=parse_prompt_type, - max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), - ) - - # 4. set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) - # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise - is_strength_max = strength == 1.0 - - # 5. Prepare latent variables - num_channels_latents = self.vae_decoder_num_latent_channels - num_channels_unet = self.unet_num_latent_channels - is_legacy = return_image_latents = num_channels_unet == 4 - - latents_outputs = self.prepare_latents( - batch_size * num_images_per_prompt, - height, - width, - generator, - latents, - image=init_image, - timestep=latent_timestep, - is_strength_max=is_strength_max, - return_noise=True, - return_image_latents=return_image_latents, - infer_op=infer_op_dict.get("vae_encoder", None), - ) - - if return_image_latents: - latents, noise, image_latents = latents_outputs - else: - latents, noise = latents_outputs - - # 6. Prepare mask latent variables - mask, masked_image_latents = self.prepare_mask_latents( - mask, - masked_image, - batch_size * num_images_per_prompt, - height, - width, - do_classifier_free_guidance, - return_masked_image_latents=True, - infer_op=infer_op_dict.get("vae_encoder", None), - ) - - # 7. Check that sizes of mask, masked image and latents match - if num_channels_unet == 9: - # default case for runwayml/stable-diffusion-inpainting - num_channels_mask = mask.shape[1] - num_channels_masked_image = masked_image_latents.shape[1] - if num_channels_latents + num_channels_mask + num_channels_masked_image != num_channels_unet: - raise ValueError( - f"Incorrect configuration settings! Received `num_channels_latents`: {num_channels_latents} +" - f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" - f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" - " `pipeline.unet` or your `mask_image` or `image` input." 
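[Editor's note] The configuration check that raises this error reduces to simple channel arithmetic: a 9-channel inpainting UNet expects the 4 latent channels, the 1-channel mask and the 4 masked-image latent channels stacked together. A minimal sketch of that layout (shapes are illustrative, not taken from a specific checkpoint):

```py
# Minimal sketch of the 9-channel inpainting input layout checked above.
# Shapes are illustrative; the spatial size is whatever the VAE produces.
import paddle

latents = paddle.randn([1, 4, 64, 64])               # noisy latents being denoised
mask = paddle.randn([1, 1, 64, 64])                  # downsampled inpainting mask
masked_image_latents = paddle.randn([1, 4, 64, 64])  # VAE encoding of the masked image

unet_sample = paddle.concat([latents, mask, masked_image_latents], axis=1)
assert unet_sample.shape[1] == 4 + 1 + 4  # == 9, e.g. runwayml/stable-diffusion-inpainting
```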
- ) - elif num_channels_unet != 4: - raise ValueError(f"The unet should have either 4 or 9 input channels, not {num_channels_unet}.") - # do_controlnet - do_controlnet = controlnet_cond is not None and num_channels_unet == 4 - if do_controlnet: - control_image, control_conditioning_scale = self.prepare_controlnet_cond( - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=controlnet_conditioning_scale, - width=width, - height=height, - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - ) - - # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - if do_classifier_free_guidance: - init_mask = mask[: mask.shape[0] // 2] - else: - init_mask = mask - - # 9. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - is_scheduler_support_step_index = self.is_scheduler_support_step_index() - - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) - else: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - output_shape = latent_model_input.shape - if not is_legacy: - # concat latents, mask, masked_image_latents in the channel dimension - latent_model_input = paddle.concat([latent_model_input, mask, masked_image_latents], axis=1) - - unet_inputs = dict( - sample=latent_model_input, - timestep=t, - encoder_hidden_states=prompt_embeds, - infer_op=infer_op_dict.get("unet", None), - output_shape=output_shape, - ) - if do_controlnet: - unet_inputs["controlnet_cond"] = control_image - unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale - # predict the noise residual - noise_pred_unet = self.unet(**unet_inputs)[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - else: - noise_pred = noise_pred_unet - - # compute the previous noisy sample x_t -> x_t-1 - if is_scheduler_support_step_index: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, step_index=i, return_pred_original_sample=False, **extra_step_kwargs - ) - else: - scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) - latents = scheduler_output.prev_sample - - if is_legacy: - if i < len(timesteps) - 1: - # masking - if add_predicted_noise: - init_latents_proper = self.scheduler.add_noise(image_latents, noise_pred_uncond, t) - else: - # https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc - noise_timestep = timesteps[i + 1] - init_latents_proper = self.scheduler.add_noise(image_latents, noise, noise_timestep) - else: - init_latents_proper = image_latents - latents = (1 - init_mask) * init_latents_proper + init_mask * latents - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - 
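[Editor's note] The `latents = (1 - init_mask) * init_latents_proper + init_mask * latents` line above is the core of the legacy (4-channel) inpainting path: at every step the unmasked region is overwritten with a re-noised copy of the original image latents, so only the masked region is free to change. A minimal sketch under hypothetical shapes:

```py
# Hedged sketch of the known-region re-injection used in the legacy branch above.
import paddle

latents = paddle.randn([1, 4, 64, 64])        # current sample from the scheduler step
image_latents = paddle.randn([1, 4, 64, 64])  # VAE encoding of the original image
init_mask = paddle.zeros([1, 1, 64, 64])      # 0 = keep original, 1 = repaint
init_mask[:, :, 16:48, 16:48] = 1.0           # repaint only the centre square

# On intermediate steps image_latents would first be re-noised to the next
# timestep (scheduler.add_noise); at the final step it is used as-is.
init_latents_proper = image_latents
latents = (1 - init_mask) * init_latents_proper + init_mask * latents
```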
callback(i, t, latents) - if i == len(timesteps) - 1: - # sync for accuracy it/s measure - paddle.device.cuda.synchronize() - - if not output_type == "latent": - image = self._decode_vae_latents( - latents / self.vae_scaling_factor, infer_op=infer_op_dict.get("vae_decoder", None) - ) - image, has_nsfw_concept = self.run_safety_checker(image) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint_legacy.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint_legacy.py deleted file mode 100644 index c8bec577e469..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint_legacy.py +++ /dev/null @@ -1,527 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Callable, Dict, List, Optional, Union - -import numpy as np -import paddle -import PIL - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTokenizer - -from ...pipeline_utils import DiffusionPipeline -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import PIL_INTERPOLATION, logging -from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel -from . import StableDiffusionPipelineOutput - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -def prepare_mask_and_masked_image(image, mask, height=None, width=None, return_image: bool = False): - """ - Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be - converted to ``paddle.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the - ``image`` and ``1`` for the ``mask``. - - The ``image`` will be converted to ``paddle.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be - binarized (``mask > 0.5``) and cast to ``paddle.float32`` too. - - Args: - image (Union[np.array, PIL.Image, paddle.Tensor]): The image to inpaint. - It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width`` - ``paddle.Tensor`` or a ``batch x channels x height x width`` ``paddle.Tensor``. - mask (_type_): The mask to apply to the image, i.e. regions to inpaint. - It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width`` - ``paddle.Tensor`` or a ``batch x 1 x height x width`` ``paddle.Tensor``. 
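[Editor's note] To make the documented contract above concrete, here is a minimal sketch of the shapes and value ranges the helper is described as producing, assuming matching 64x64 PIL inputs; all names and sizes here are illustrative only.

```py
# Hedged sketch of the normalized image, binarized mask and masked image
# described in the docstring above.
import numpy as np
import paddle
from PIL import Image

img = Image.new("RGB", (64, 64), color=(200, 30, 30))
msk = Image.new("L", (64, 64), color=0)
msk.paste(255, (16, 16, 48, 48))  # white square = region to repaint

image = paddle.to_tensor(np.array(img)[None].transpose(0, 3, 1, 2).astype("float32")) / 127.5 - 1.0
mask = paddle.to_tensor((np.array(msk)[None, None].astype("float32") / 255.0 >= 0.5).astype("float32"))
masked_image = image * (1.0 - mask)  # black (0) mask pixels keep the original content

print(image.shape, mask.shape)  # [1, 3, 64, 64], [1, 1, 64, 64]; image in [-1, 1], mask in {0, 1}
```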
- - - Raises: - ValueError: ``paddle.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``paddle.Tensor`` mask - should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions. - TypeError: ``mask`` is a ``paddle.Tensor`` but ``image`` is not - (ot the other way around). - - Returns: - tuple[paddle.Tensor]: The pair (mask, masked_image) as ``paddle.Tensor`` with 4 - dimensions: ``batch x channels x height x width``. - """ - - if image is None: - raise ValueError("`image` input cannot be undefined.") - - if mask is None: - raise ValueError("`mask_image` input cannot be undefined.") - - if isinstance(image, paddle.Tensor): - if not isinstance(mask, paddle.Tensor): - raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not") - - # Batch single image - if image.ndim == 3: - assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" - image = image.unsqueeze(0) - - # Batch and add channel dim for single mask - if mask.ndim == 2: - mask = mask.unsqueeze(0).unsqueeze(0) - - # Batch single mask or add channel dim - if mask.ndim == 3: - # Single batched mask, no channel dim or single mask not batched but channel dim - if mask.shape[0] == 1: - mask = mask.unsqueeze(0) - - # Batched masks no channel dim - else: - mask = mask.unsqueeze(1) - - assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" - assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" - assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" - - # Check image is in [-1, 1] - if image.min() < -1 or image.max() > 1: - raise ValueError("Image should be in [-1, 1] range") - - # Check mask is in [0, 1] - if mask.min() < 0 or mask.max() > 1: - raise ValueError("Mask should be in [0, 1] range") - - # Binarize mask - mask[mask < 0.5] = 0 - mask[mask >= 0.5] = 1 - - # Image as float32 - image = image.cast(dtype=paddle.float32) - elif isinstance(mask, paddle.Tensor): - raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") - else: - # preprocess image - if isinstance(image, (PIL.Image.Image, np.ndarray)): - image = [image] - if isinstance(image, list) and isinstance(image[0], PIL.Image.Image): - # resize all images w.r.t passed height an width - if width is None or height is None: - w, h = image[0].size - else: - w, h = width, height - w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - image = [i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) for i in image] - image = [np.array(i.convert("RGB"))[None, :] for i in image] - image = np.concatenate(image, axis=0) - elif isinstance(image, list) and isinstance(image[0], np.ndarray): - image = np.concatenate([i[None, :] for i in image], axis=0) - - image = image.transpose(0, 3, 1, 2) - image = paddle.to_tensor(image, dtype=paddle.float32) / 127.5 - 1.0 - - # preprocess mask - if isinstance(mask, (PIL.Image.Image, np.ndarray)): - mask = [mask] - - if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image): - # resize all images w.r.t passed height an width - if width is None or height is None: - w, h = mask[0].size - else: - w, h = width, height - w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - mask = [i.resize((w, h), resample=PIL_INTERPOLATION["nearest"]) for i in mask] - mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) - mask = mask.astype(np.float32) / 255.0 - elif 
isinstance(mask, list) and isinstance(mask[0], np.ndarray): - mask = np.concatenate([m[None, None, :] for m in mask], axis=0) - - mask[mask < 0.5] = 0 - mask[mask >= 0.5] = 1 - mask = paddle.to_tensor(mask) - - masked_image = image * (mask < 0.5) - - # n.b. ensure backwards compatibility as old function does not return image - if return_image: - return mask, masked_image, image - - return mask, masked_image - - -class FastDeployStableDiffusionInpaintPipelineLegacy(DiffusionPipeline, FastDeployDiffusionPipelineMixin): - r""" - Pipeline for text-guided image inpainting legacy using Stable Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving etc.) - - Args: - vae_encoder ([`FastDeployRuntimeModel`]): - Variational Auto-Encoder (VAE) Model to encode images to latent representations. - vae_decoder ([`FastDeployRuntimeModel`]): - Variational Auto-Encoder (VAE) Model to decode images from latent representations. - text_encoder ([`FastDeployRuntimeModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`FastDeployRuntimeModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`PNDMScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] - or [`DPMSolverMultistepScheduler`]. - safety_checker ([`FastDeployRuntimeModel`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = False, - ): - super().__init__() - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." 
- ) - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - self.register_modules( - vae_encoder=vae_encoder, - vae_decoder=vae_decoder, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.register_to_config(requires_safety_checker=requires_safety_checker) - self.post_init() - - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: Union[paddle.Tensor, PIL.Image.Image] = None, - mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, - height: int = None, - width: int = None, - strength: float = 1.0, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - add_predicted_noise: Optional[bool] = False, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - parse_prompt_type: Optional[str] = "lpw", - max_embeddings_multiples: Optional[int] = 3, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, - controlnet_conditioning_scale: float = 1.0, - infer_op_dict: Dict[str, str] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. This is the image whose masked region will be inpainted. - mask_image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be - replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a - PIL image, it will be converted to a single channel (luminance) before use. If mask is a tensor, the - expected shape should be either `(B, H, W, C)` or `(B, C, H, W)`, where C is 1 or 3. - height (`int`, *optional*, defaults to None): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to None): - The width in pixels of the generated image. - strength (`float`, *optional*, defaults to 1.0): - Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength` - is 1, the denoising process will be run on the masked area for the full number of iterations specified - in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more noise to - that region the larger the `strength`. If `strength` is 0, no inpainting will occur. - num_inference_steps (`int`, *optional*, defaults to 50): - The reference number of denoising steps. More denoising steps usually lead to a higher quality image at - the expense of slower inference. 
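[Editor's note] A usage sketch for the call signature documented in this section. It assumes a FastDeploy-exported Stable Diffusion directory (the path, image files and prompt below are placeholders) and the top-level import ppdiffusers usually exposes when fastdeploy is installed; runtime options are left at their defaults.

```py
# Hedged usage sketch; paths, file names and the prompt are placeholders.
from PIL import Image

from ppdiffusers import FastDeployStableDiffusionInpaintPipelineLegacy

pipe = FastDeployStableDiffusionInpaintPipelineLegacy.from_pretrained("./stable-diffusion-v1-5-fastdeploy")
init_image = Image.open("dog.png").convert("RGB").resize((512, 512))
mask_image = Image.open("dog_mask.png").convert("L").resize((512, 512))  # white = repaint

result = pipe(
    prompt="a corgi wearing sunglasses, studio lighting",
    image=init_image,
    mask_image=mask_image,
    strength=0.75,            # run ~75% of the denoising schedule on the masked area
    num_inference_steps=50,
    guidance_scale=7.5,
)
result.images[0].save("inpainted.png")
```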
This parameter will be modulated by `strength`, as explained above. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - add_predicted_noise (`bool`, *optional*, defaults to False): - Use predicted noise instead of random noise when constructing noisy versions of the original image in - the reverse diffusion process - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of [paddle generator(s)] to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noise tensor, sampled from a Gaussian distribution, to be used as inputs for image - generation. If not provided, a noise tensor will ge generated by sampling using the supplied random - `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. 
Preprocess mask and image - mask, masked_image, init_image = prepare_mask_and_masked_image( - image, - mask_image, - height, - width, - return_image=True, - ) - height, width = init_image.shape[-2:] - - # 1. Check inputs - self.check_inputs( - prompt, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - strength, - ) - infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # do_controlnet - do_controlnet = controlnet_cond is not None - if do_controlnet: - control_image, control_conditioning_scale = self.prepare_controlnet_cond( - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=controlnet_conditioning_scale, - width=width, - height=height, - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - ) - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - parse_prompt_type=parse_prompt_type, - max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), - ) - - # 4. set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) - # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise - is_strength_max = strength == 1.0 - - # 5. Prepare latent variables - latents, noise, image_latents = self.prepare_latents( - batch_size * num_images_per_prompt, - height, - width, - generator, - latents, - image=init_image, - timestep=latent_timestep, - is_strength_max=is_strength_max, - return_noise=True, - return_image_latents=True, - infer_op=infer_op_dict.get("vae_encoder", None), - ) - - # 6. Prepare mask latent variables - mask = self.prepare_mask_latents( - mask, - masked_image, - batch_size * num_images_per_prompt, - height, - width, - do_classifier_free_guidance, - return_masked_image_latents=False, - infer_op=infer_op_dict.get("vae_encoder", None), - ) - - # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - if do_classifier_free_guidance: - init_mask = mask[: mask.shape[0] // 2] - else: - init_mask = mask - - # 8. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - is_scheduler_support_step_index = self.is_scheduler_support_step_index() - - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) - else: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - unet_inputs = dict( - sample=latent_model_input, - timestep=t, - encoder_hidden_states=prompt_embeds, - infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, - ) - if do_controlnet: - unet_inputs["controlnet_cond"] = control_image - unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale - # predict the noise residual - noise_pred_unet = self.unet(**unet_inputs)[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - else: - noise_pred = noise_pred_unet - - # compute the previous noisy sample x_t -> x_t-1 - if is_scheduler_support_step_index: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, step_index=i, return_pred_original_sample=False, **extra_step_kwargs - ) - else: - scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) - latents = scheduler_output.prev_sample - - if i < len(timesteps) - 1: - # masking - if add_predicted_noise: - init_latents_proper = self.scheduler.add_noise(image_latents, noise_pred_uncond, t) - else: - # https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc - noise_timestep = timesteps[i + 1] - init_latents_proper = self.scheduler.add_noise(image_latents, noise, noise_timestep) - else: - init_latents_proper = image_latents - - latents = (1 - init_mask) * init_latents_proper + init_mask * latents - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - if i == len(timesteps) - 1: - # sync for accuracy it/s measure - paddle.device.cuda.synchronize() - - if not output_type == "latent": - image = self._decode_vae_latents( - latents / self.vae_scaling_factor, infer_op=infer_op_dict.get("vae_decoder", None) - ) - image, has_nsfw_concept = self.run_safety_checker(image) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_mega.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_mega.py deleted file mode 100644 index d2c9622fd7c8..000000000000 --- 
a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_mega.py +++ /dev/null @@ -1,360 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Callable, Dict, List, Optional, Union - -import paddle -import PIL.Image - -from ...utils import logging -from .pipeline_fastdeploy_cycle_diffusion import FastDeployCycleDiffusionPipeline -from .pipeline_fastdeploy_stable_diffusion import FastDeployStableDiffusionPipeline -from .pipeline_fastdeploy_stable_diffusion_img2img import ( - FastDeployStableDiffusionImg2ImgPipeline, -) -from .pipeline_fastdeploy_stable_diffusion_inpaint import ( - FastDeployStableDiffusionInpaintPipeline, -) -from .pipeline_fastdeploy_stable_diffusion_inpaint_legacy import ( - FastDeployStableDiffusionInpaintPipelineLegacy, -) - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class FastDeployStableDiffusionMegaPipeline(FastDeployStableDiffusionPipeline): - r""" - Pipeline for generation using FastDeployStableDiffusion. - - This model inherits from [`FastDeployStableDiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving etc.) - - Args: - vae_encoder ([`FastDeployRuntimeModel`]): - Variational Auto-Encoder (VAE) Model to encode images to latent representations. - vae_decoder ([`FastDeployRuntimeModel`]): - Variational Auto-Encoder (VAE) Model to decode images from latent representations. - text_encoder ([`FastDeployRuntimeModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`FastDeployRuntimeModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`PNDMScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] - or [`DPMSolverMultistepScheduler`]. - safety_checker ([`FastDeployRuntimeModel`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPFeatureExtractor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
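[Editor's note] Taken together with the dispatch methods defined below (`text2img`, `img2img`, `inpaint_legacy`, `inpaint`, `cycle_diffusion`), this class is meant to be loaded once and reused for several tasks. A hedged usage sketch with placeholder paths and prompts:

```py
# Hedged sketch: one loaded mega pipeline, several task-specific entry points.
from PIL import Image

from ppdiffusers import FastDeployStableDiffusionMegaPipeline

pipe = FastDeployStableDiffusionMegaPipeline.from_pretrained("./stable-diffusion-v1-5-fastdeploy")

text_out = pipe.text2img("a watercolor fox", height=512, width=512)  # same as pipe(...)
sketch = Image.open("fox_sketch.png").convert("RGB").resize((512, 512))
img_out = pipe.img2img("a watercolor fox", image=sketch, strength=0.6)
mask = Image.open("fox_mask.png").convert("L").resize((512, 512))
inp_out = pipe.inpaint_legacy("a watercolor fox with a hat", image=sketch, mask_image=mask, strength=0.8)

text_out.images[0].save("text2img.png")
```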
- """ - _optional_components = ["vae_encoder", "safety_checker", "feature_extractor"] - - def __call__(self, *args, **kwargs): - return self.text2img(*args, **kwargs) - - def text2img( - self, - prompt: Union[str, List[str]], - height: Optional[int] = 512, - width: Optional[int] = 512, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.0, - generator: Optional[paddle.Generator] = None, - latents: Optional[paddle.Tensor] = None, - parse_prompt_type: Optional[str] = "lpw", - max_embeddings_multiples: Optional[int] = 3, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, - controlnet_conditioning_scale: float = 1.0, - infer_op_dict: Dict[str, str] = None, - ): - - expected_components = inspect.signature(FastDeployStableDiffusionPipeline.__init__).parameters.keys() - components = {name: component for name, component in self.components.items() if name in expected_components} - temp_pipeline = FastDeployStableDiffusionPipeline( - **components, requires_safety_checker=self.config.requires_safety_checker - ) - temp_pipeline._progress_bar_config = self._progress_bar_config - output = temp_pipeline( - prompt=prompt, - height=height, - width=width, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - negative_prompt=negative_prompt, - num_images_per_prompt=num_images_per_prompt, - eta=eta, - generator=generator, - latents=latents, - parse_prompt_type=parse_prompt_type, - max_embeddings_multiples=max_embeddings_multiples, - output_type=output_type, - return_dict=return_dict, - callback=callback, - callback_steps=callback_steps, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=controlnet_conditioning_scale, - infer_op_dict=infer_op_dict, - ) - return output - - def img2img( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - height: Optional[int] = None, - width: Optional[int] = None, - strength: float = 0.8, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.0, - generator: Optional[paddle.Generator] = None, - latents: Optional[paddle.Tensor] = None, - parse_prompt_type: Optional[str] = "lpw", - max_embeddings_multiples: Optional[int] = 3, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, - controlnet_conditioning_scale: float = 1.0, - infer_op_dict: Dict[str, str] = None, - ): - expected_components = inspect.signature(FastDeployStableDiffusionImg2ImgPipeline.__init__).parameters.keys() - components = {name: component for name, component in self.components.items() if name in expected_components} - temp_pipeline = FastDeployStableDiffusionImg2ImgPipeline( - **components, requires_safety_checker=self.config.requires_safety_checker - ) - temp_pipeline._progress_bar_config = self._progress_bar_config - output = temp_pipeline( - prompt=prompt, - image=image, - height=height, - width=width, - strength=strength, - 
num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - negative_prompt=negative_prompt, - num_images_per_prompt=num_images_per_prompt, - eta=eta, - generator=generator, - latents=latents, - parse_prompt_type=parse_prompt_type, - max_embeddings_multiples=max_embeddings_multiples, - output_type=output_type, - return_dict=return_dict, - callback=callback, - callback_steps=callback_steps, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=controlnet_conditioning_scale, - infer_op_dict=infer_op_dict, - ) - - return output - - def inpaint_legacy( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - mask_image: Union[paddle.Tensor, PIL.Image.Image], - height: Optional[int] = None, - width: Optional[int] = None, - strength: float = 0.8, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.0, - generator: Optional[paddle.Generator] = None, - latents: Optional[paddle.Tensor] = None, - parse_prompt_type: Optional[str] = "lpw", - max_embeddings_multiples: Optional[int] = 3, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, - controlnet_conditioning_scale: float = 1.0, - infer_op_dict: Dict[str, str] = None, - ): - assert ( - self.unet_num_latent_channels == 4 - ), f"Detected `unet_num_latent_channels` is {self.unet_num_latent_channels}, Plese use `inpaint` method." - expected_components = inspect.signature( - FastDeployStableDiffusionInpaintPipelineLegacy.__init__ - ).parameters.keys() - components = {name: component for name, component in self.components.items() if name in expected_components} - temp_pipeline = FastDeployStableDiffusionInpaintPipelineLegacy( - **components, requires_safety_checker=self.config.requires_safety_checker - ) - temp_pipeline._progress_bar_config = self._progress_bar_config - output = temp_pipeline( - prompt=prompt, - image=image, - mask_image=mask_image, - height=height, - width=width, - strength=strength, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - negative_prompt=negative_prompt, - num_images_per_prompt=num_images_per_prompt, - eta=eta, - generator=generator, - latents=latents, - parse_prompt_type=parse_prompt_type, - max_embeddings_multiples=max_embeddings_multiples, - output_type=output_type, - return_dict=return_dict, - callback=callback, - callback_steps=callback_steps, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=controlnet_conditioning_scale, - infer_op_dict=infer_op_dict, - ) - - return output - - def inpaint( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - mask_image: Union[paddle.Tensor, PIL.Image.Image], - height=None, - width=None, - strength: float = 0.8, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.0, - generator: Optional[paddle.Generator] = None, - latents: Optional[paddle.Tensor] = None, - parse_prompt_type: Optional[str] = "lpw", - max_embeddings_multiples: Optional[int] = 3, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: 
Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, - controlnet_conditioning_scale: float = 1.0, - infer_op_dict: Dict[str, str] = None, - ): - assert self.unet_num_latent_channels in [4, 9] - expected_components = inspect.signature(FastDeployStableDiffusionInpaintPipeline.__init__).parameters.keys() - components = {name: component for name, component in self.components.items() if name in expected_components} - temp_pipeline = FastDeployStableDiffusionInpaintPipeline( - **components, requires_safety_checker=self.config.requires_safety_checker - ) - temp_pipeline._progress_bar_config = self._progress_bar_config - output = temp_pipeline( - prompt=prompt, - image=image, - mask_image=mask_image, - height=height, - width=width, - strength=strength, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - negative_prompt=negative_prompt, - num_images_per_prompt=num_images_per_prompt, - eta=eta, - generator=generator, - latents=latents, - parse_prompt_type=parse_prompt_type, - max_embeddings_multiples=max_embeddings_multiples, - output_type=output_type, - return_dict=return_dict, - callback=callback, - callback_steps=callback_steps, - controlnet_cond=controlnet_cond, - controlnet_conditioning_scale=controlnet_conditioning_scale, - infer_op_dict=infer_op_dict, - ) - - return output - - def cycle_diffusion( - self, - prompt: Union[str, List[str]], - source_prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image] = None, - height: Optional[int] = None, - width: Optional[int] = None, - strength: float = 0.8, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[paddle.Tensor] = None, - source_guidance_scale: Optional[float] = 1, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.1, - latents: Optional[paddle.Tensor] = None, - parse_prompt_type: Optional[str] = "lpw", - max_embeddings_multiples: Optional[int] = 3, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - infer_op_dict: Dict[str, str] = None, - ): - expected_components = inspect.signature(FastDeployCycleDiffusionPipeline.__init__).parameters.keys() - components = {name: component for name, component in self.components.items() if name in expected_components} - temp_pipeline = FastDeployCycleDiffusionPipeline( - **components, requires_safety_checker=self.config.requires_safety_checker - ) - temp_pipeline._progress_bar_config = self._progress_bar_config - output = temp_pipeline( - prompt=prompt, - source_prompt=source_prompt, - source_guidance_scale=source_guidance_scale, - image=image, - height=height, - width=width, - strength=strength, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - negative_prompt=negative_prompt, - num_images_per_prompt=num_images_per_prompt, - eta=eta, - generator=generator, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - latents=latents, - parse_prompt_type=parse_prompt_type, - max_embeddings_multiples=max_embeddings_multiples, - output_type=output_type, - return_dict=return_dict, - callback=callback, - 
callback_steps=callback_steps, - infer_op_dict=infer_op_dict, - ) - - return output diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_upscale.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_upscale.py deleted file mode 100644 index dc1cf3548bcb..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_upscale.py +++ /dev/null @@ -1,317 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import paddle -import PIL - -from ...pipeline_utils import DiffusionPipeline -from ...schedulers import DDPMScheduler -from ...utils import logging -from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel -from ..pipeline_utils import ImagePipelineOutput - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class FastDeployStableDiffusionUpscalePipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): - def __init__( - self, - vae: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: Any, - unet: FastDeployRuntimeModel, - low_res_scheduler: DDPMScheduler, - scheduler: Any, - max_noise_level: int = 350, - ): - super().__init__( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - low_res_scheduler=low_res_scheduler, - scheduler=scheduler, - safety_checker=None, - feature_extractor=None, - watermarker=None, - max_noise_level=max_noise_level, - ) - self.post_init(vae_scaling_factor=0.08333) - - def check_inputs(self, prompt, image, noise_level, callback_steps): - if not isinstance(prompt, str) and not isinstance(prompt, list): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if ( - not isinstance(image, paddle.Tensor) - and not isinstance(image, PIL.Image.Image) - and not isinstance(image, list) - ): - raise ValueError( - f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}" - ) - - # verify batch size of prompt and image are same if image is a list or tensor - if isinstance(image, list) or isinstance(image, paddle.Tensor): - if isinstance(prompt, str): - batch_size = 1 - else: - batch_size = len(prompt) - if isinstance(image, list): - image_batch_size = len(image) - else: - image_batch_size = image.shape[0] - if batch_size != image_batch_size: - raise ValueError( - f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}." - " Please make sure that passed `prompt` matches the batch size of `image`." 
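[Editor's note] A usage sketch for the `__call__` entry point defined just below, assuming an exported FastDeploy upscaler directory (paths and file names are placeholders): the pipeline takes a text prompt plus a low-resolution image and returns a higher-resolution result (4x for the standard x4 upscaler checkpoint).

```py
# Hedged usage sketch; the model directory and image path are placeholders.
from PIL import Image

from ppdiffusers import FastDeployStableDiffusionUpscalePipeline

pipe = FastDeployStableDiffusionUpscalePipeline.from_pretrained("./stable-diffusion-x4-upscaler-fastdeploy")
low_res = Image.open("cat_128.png").convert("RGB").resize((128, 128))

upscaled = pipe(
    prompt="a white cat, highly detailed",
    image=low_res,
    num_inference_steps=75,
    guidance_scale=9.0,
    noise_level=20,   # how much noise to add to the low-res conditioning image
).images[0]
upscaled.save("cat_upscaled.png")
```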
- ) - - # check noise level - if noise_level > self.config.max_noise_level: - raise ValueError(f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - def __call__( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]], - num_inference_steps: int = 75, - guidance_scale: float = 9.0, - noise_level: int = 20, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - parse_prompt_type: Optional[str] = "lpw", - max_embeddings_multiples: Optional[int] = 3, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - infer_op_dict: Dict[str, str] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - image (`np.ndarray` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. This parameter will be modulated by `strength`. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - noise_level TODO - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`np.random.RandomState`, *optional*): - A np.random.RandomState to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`np.ndarray`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
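[Editor's note] What distinguishes this upscaler from the other pipelines is how the low-resolution image conditions the UNet, as the code further below shows: the image is noised to `noise_level` with the dedicated low-res scheduler and then concatenated with the latents along the channel axis. A minimal sketch with illustrative shapes:

```py
# Hedged sketch of the low-resolution conditioning used by the upscaler.
import paddle
from ppdiffusers import DDPMScheduler

low_res_scheduler = DDPMScheduler()        # stands in for the pipeline's low_res_scheduler
image = paddle.randn([1, 3, 128, 128])     # preprocessed low-res input in [-1, 1]
latents = paddle.randn([1, 4, 128, 128])   # latents being denoised (same spatial size)

noise_level = paddle.to_tensor([20], dtype="int64")
noised_image = low_res_scheduler.add_noise(image, paddle.randn(image.shape), noise_level)

unet_sample = paddle.concat([latents, noised_image], axis=1)  # 4 + 3 = 7 UNet input channels
```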
If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`np.ndarray`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - - # 1. Check inputs - self.check_inputs(prompt, image, noise_level, callback_steps) - infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - text_embeddings = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - parse_prompt_type=parse_prompt_type, - max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), - ) - - # 4. Preprocess image - image = self.image_processor.preprocess(image) - - # 5. set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps) - - # 5. Add noise to image - noise_level = paddle.to_tensor([noise_level], dtype="int64") - noise = paddle.randn(image.shape, generator=generator, dtype=text_embeddings.dtype) - image = self.low_res_scheduler.add_noise(image, noise, noise_level) - - batch_multiplier = 2 if do_classifier_free_guidance else 1 - image = paddle.concat([image] * batch_multiplier * num_images_per_prompt) - noise_level = paddle.concat([noise_level] * image.shape[0]) - - # 6. 
Prepare latent variables - height, width = image.shape[2:] - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - height, - width, - generator, - latents, - ) - NUM_UNET_INPUT_CHANNELS = self.unet_num_latent_channels - NUM_LATENT_CHANNELS = self.vae_decoder_num_latent_channels - - # 7. Check that sizes of image and latents match - num_channels_image = image.shape[1] - if NUM_LATENT_CHANNELS + num_channels_image != NUM_UNET_INPUT_CHANNELS: - raise ValueError( - "Incorrect configuration settings! The config of `pipeline.unet` expects" - f" {NUM_UNET_INPUT_CHANNELS} but received `num_channels_latents`: {NUM_LATENT_CHANNELS} +" - f" `num_channels_image`: {num_channels_image} " - f" = {NUM_LATENT_CHANNELS+num_channels_image}. Please verify the config of" - " `pipeline.unet` or your `image` input." - ) - - # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 9. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - is_scheduler_support_step_index = self.is_scheduler_support_step_index() - - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) - else: - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - unet_inputs = dict( - sample=paddle.concat( - [latent_model_input, image], axis=1 - ), # concat latents, image in the channel dimension - timestep=t, - encoder_hidden_states=prompt_embeds, - infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, - ) - # predict the noise residual - noise_pred_unet = self.unet(**unet_inputs)[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - else: - noise_pred = noise_pred_unet - - # compute the previous noisy sample x_t -> x_t-1 - if is_scheduler_support_step_index: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, step_index=i, return_pred_original_sample=False, **extra_step_kwargs - ) - else: - scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) - latents = scheduler_output.prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - if i == len(timesteps) - 1: - # sync for accuracy it/s measure - paddle.device.cuda.synchronize() - - if not output_type == "latent": - image = self._decode_vae_latents( - latents / self.vae_scaling_factor, infer_op=infer_op_dict.get("vae_decoder", None) - ) - else: - image = latents - - do_denormalize = [True] * image.shape[0] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image,) - - return ImagePipelineOutput( - images=image, - ) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py 
b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py deleted file mode 100644 index 27c40e607da8..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ /dev/null @@ -1,608 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Any, Callable, Dict, List, Optional, Union - -import paddle -from packaging import version - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer - -from ...configuration_utils import FrozenDict -from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, logging, randn_tensor, replace_example_docstring -from ..pipeline_utils import DiffusionPipeline -from . import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import paddle - >>> from ppdiffusers import StableDiffusionPipeline - - >>> pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", paddle_dtype=paddle.float16) - - >>> prompt = "a photo of an astronaut riding a horse on mars" - >>> image = pipe(prompt).images[0] - ``` -""" - - -class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): - r""" - Pipeline for text-to-image generation using Stable Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - In addition the pipeline inherits the following loading methods: - - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] - - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] - as well as the following saving methods: - - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. 
- scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`PNDMScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] - or [`DPMSolverMultistepScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = True, - ): - super().__init__() - - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " - "to update the config accordingly as leaving `steps_offset` might led to incorrect results" - " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," - " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file" - ) - deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." - " `clip_sample` should be set to False in the configuration file. Please make sure to update the" - " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" - " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" - " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" - ) - deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["clip_sample"] = False - scheduler._internal_dict = FrozenDict(new_config) - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." 
- ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version).base_version - ) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 - if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: - deprecation_message = ( - "The configuration file of the unet has set the default `sample_size` to smaller than" - " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" - " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" - " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" - " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" - " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" - " in the config might lead to incorrect results in future versions. If you have downloaded this" - " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file" - ) - deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(unet.config) - new_config["sample_size"] = 64 - unet._internal_dict = FrozenDict(new_config) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. 
- """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = [batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor] - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/ppdiffusers/models/cross_attention.py). - - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - image = latents - has_nsfw_concept = None - elif output_type == "pil": - # 8. Post-processing - image = self.decode_latents(latents) - - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - - # 10. Convert to PIL - image = self.numpy_to_pil(image) - else: - # 8. Post-processing - image = self.decode_latents(latents) - - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_adapter.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_adapter.py deleted file mode 100644 index f38f70717555..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_adapter.py +++ /dev/null @@ -1,592 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
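Both the text-to-image pipeline deleted above and the adapter pipeline deleted below apply the same classifier-free guidance combination inside their denoising loops, i.e. `noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)` computed on the two halves of the UNet output for the concatenated [unconditional, conditional] batch. The following is only a minimal illustrative sketch of that arithmetic in plain Python with hypothetical toy values; it is not part of either deleted file.

```py
# Illustrative sketch of the classifier-free guidance update used in the
# denoising loops above and below (plain Python, toy values only).
def cfg_update(noise_uncond, noise_text, guidance_scale):
    # guidance_scale == 1.0 reduces to the conditional prediction alone
    return [u + guidance_scale * (t - u) for u, t in zip(noise_uncond, noise_text)]

# one hypothetical "pixel" of predicted noise per branch
noise_uncond = [0.10]
noise_text = [0.30]
print(cfg_update(noise_uncond, noise_text, 7.5))  # 0.10 + 7.5 * 0.20 -> [1.6]
```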
- -import inspect -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import paddle -import PIL - -from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer - -# from ...loaders import TextualInversionLoaderMixin -from ...models import AutoencoderKL, MultiAdapter, T2IAdapter, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import PIL_INTERPOLATION, logging, randn_tensor, replace_example_docstring -from ..pipeline_utils import DiffusionPipeline -from . import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker - -logger = logging.get_logger(__name__) -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> from PIL import Image - >>> from ppdiffusers.utils import load_image - - >>> image = load_image("https://huggingface.co/RzZ/sd-v1-4-adapter-color/resolve/main/color_ref.png") - - >>> color_palette = image.resize((8, 8)) - >>> color_palette = color_palette.resize((512, 512), resample=Image.Resampling.NEAREST) - - >>> import paddle - >>> from ppdiffusers import StableDiffusionAdapterPipeline, T2IAdapter - - >>> adapter = T2IAdapter.from_pretrained("RzZ/sd-v1-4-adapter-color") - >>> pipe = StableDiffusionAdapterPipeline.from_pretrained( - ... "CompVis/stable-diffusion-v1-4", - ... adapter=adapter, - ... paddle_dtype=paddle.float16, - ... ) - - >>> out_image = pipe( - ... "At night, glowing cubes in front of the beach", - ... image=color_palette, - ... generator=generator, - ... ).images[0] - ``` -""" - - -def is_power_of_two(n): - if n <= 0: - return False - else: - return n & (n - 1) == 0 - - -def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: - w, h = images.size - - coef = w / h - - w, h = img_size, img_size - - if coef >= 1: - w = int(round(img_size / 8 * coef) * 8) - else: - h = int(round(img_size / 8 / coef) * 8) - - images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) - - return images - - -def preprocess(image): - if isinstance(image, paddle.Tensor): - return image - elif isinstance(image, PIL.Image.Image): - image = [image] - if isinstance(image[0], PIL.Image.Image): - w, h = image[0].size - w, h = (x - x % 8 for x in (w, h)) - image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])) for i in image] - image = [(i[None, ..., None] if i.ndim == 2 else i[None, ...]) for i in image] - image = np.concatenate(image, axis=0) - image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = paddle.to_tensor(data=image) - elif isinstance(image[0], paddle.Tensor): - if image[0].ndim == 3: - image = paddle.stack(x=image, axis=0) - elif image[0].ndim == 4: - image = paddle.concat(x=image, axis=0) - else: - raise ValueError( - f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but recive: {image[0].ndim}" - ) - else: - raise ValueError("Invalid image type!") - return image - - -class StableDiffusionAdapterPipeline(DiffusionPipeline): - """ - Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - adapter ([`T2IAdapter`] or [`MultiAdapter`] or `List[T2IAdapter]`): - Provides additional conditioning to the unet during the denoising process. 
If you set multiple Adapter as a - list, the outputs from each Adapter are added together to create one combined additional conditioning. - adapter_weights (`List[float]`, *optional*, defaults to None): - List of floats representing the weight which will be multiply to each adapter's output before adding them - together. - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - CLIP, specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class CLIPTokenizer. - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPFeatureExtractor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - adapter: Union[T2IAdapter, MultiAdapter, List[T2IAdapter]], - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - adapter_weights: Optional[List[float]] = None, - requires_safety_checker: bool = True, - ): - super().__init__() - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - if safety_checker is not None and feature_extractor is None: - raise ValueError( - "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - if isinstance(adapter, (list, tuple)): - adapter = MultiAdapter(adapter, adapter_weights=adapter_weights) - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - adapter=adapter, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - def enable_vae_slicing(self): - """ - Enable sliced VAE decoding. 
- - When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several - steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - def disable_vae_slicing(self): - """ - Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - """ - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - if prompt_embeds is None: - # if isinstance(self, TextualInversionLoaderMixin): - # prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - f"The following part of your input was truncated because CLIP can only handle sequences up to {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - prompt_embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask) - prompt_embeds = prompt_embeds[0] - prompt_embeds = prompt_embeds.astype(self.text_encoder.dtype) - bs_embed, seq_len, _ = prompt_embeds.shape - prompt_embeds = prompt_embeds.tile(repeat_times=[1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - 
elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} != {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`: {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - # if isinstance(self, TextualInversionLoaderMixin): - # uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - # self.tokenizer) - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, padding="max_length", max_length=max_length, truncation=True, return_tensors="pd" - ) - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - negative_prompt_embeds = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask) - negative_prompt_embeds = negative_prompt_embeds[0] - if do_classifier_free_guidance: - seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.astype(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile(repeat_times=[1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape((batch_size * num_images_per_prompt, seq_len, -1)) - prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds]) - return prompt_embeds - - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.astype(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(min=0, max=1) - image = image.cpu().transpose(perm=[0, 2, 3, 1]).astype(dtype="float32").numpy() - return image - - def prepare_extra_step_kwargs(self, generator, eta): - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - if ( - callback_steps is None - or callback_steps is not None - and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}." - ) - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to only forward one of the two." 
- ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - f"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds` {negative_prompt_embeds.shape}." - ) - - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - else: - latents = latents - latents = latents * self.scheduler.init_noise_sigma - return latents - - def _default_height_width(self, height, width, image): - while isinstance(image, list): - image = image[0] - if height is None: - if isinstance(image, PIL.Image.Image): - height = image.height - elif isinstance(image, paddle.Tensor): - height = image.shape[-2] - height = height // 8 * 8 - if width is None: - if isinstance(image, PIL.Image.Image): - width = image.width - elif isinstance(image, paddle.Tensor): - width = image.shape[-1] - width = width // 8 * 8 - return height, width - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - adapter_conditioning_scale: Union[float, List[float]] = 1.0, - ): - """ - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`paddle.Tensor`, `PIL.Image.Image`, `List[paddle.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): - The Adapter input condition. 
Adapter uses this input condition to generate guidance to Unet. If the - type is specified as `paddle.Tensor`, it is passed to Adapter as is. PIL.Image.Image` can also be - accepted as an image. The control image is automatically resized to fit the output image. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) - to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. 
- callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention]. - adapter_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): - The outputs of the adapter are multiplied by `adapter_conditioning_scale` before they are added to the - residual in the original unet. If multiple adapters are specified in init, you can set the - corresponding scale as a list. - - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - height, width = self._default_height_width(height, width, image) - if (not is_power_of_two(height)) or (not is_power_of_two(width)): - height = 512 - width = 512 - image = resize(image, 512) - - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - is_multi_adapter = isinstance(self.adapter, MultiAdapter) - if is_multi_adapter: - adapter_input = [preprocess(img) for img in image] - n, c, h, w = adapter_input[0].shape - adapter_input = paddle.stack(x=[x.reshape([n * c, h, w]) for x in adapter_input]) - else: - adapter_input = preprocess(image) - adapter_input = adapter_input.astype(self.adapter.dtype) - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - do_classifier_free_guidance = guidance_scale > 1.0 - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - num_channels_latents = self.unet.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - adapter_state = self.adapter(adapter_input) - for k, v in enumerate(adapter_state): - adapter_state[k] = v * adapter_conditioning_scale - if num_images_per_prompt > 1: - for k, v in enumerate(adapter_state): - adapter_state[k] = v.tile(repeat_times=[num_images_per_prompt, 1, 1, 1]) - if do_classifier_free_guidance: - for k, v in enumerate(adapter_state): - adapter_state[k] = paddle.concat(x=[v] * 2, axis=0) - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - noise_pred = self.unet( - latent_model_input, - t, - 
encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - down_block_additional_residuals=[state.clone() for state in adapter_state], - ).sample - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0: - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - if output_type == "latent": - image = latents - has_nsfw_concept = None - elif output_type == "pil": - image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - image = self.numpy_to_pil(image) - else: - image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - if not return_dict: - return image, has_nsfw_concept - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_all_in_one.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_all_in_one.py deleted file mode 100644 index d3a2ff069ff5..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_all_in_one.py +++ /dev/null @@ -1,1343 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -import os -import random -import re -import time -from typing import Callable, List, Optional, Union - -import numpy as np -import paddle -import PIL -import PIL.Image -from packaging import version - -from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer - -from ...configuration_utils import FrozenDict -from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...pipeline_utils import DiffusionPipeline -from ...schedulers import ( - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, -) -from ...utils import PIL_INTERPOLATION, deprecate, logging -from ...utils.testing_utils import load_image -from . 
import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -def save_all(images, FORMAT="jpg", OUTDIR="./outputs/"): - if not isinstance(images, (list, tuple)): - images = [images] - for image in images: - PRECISION = "fp32" - argument = image.argument - os.makedirs(OUTDIR, exist_ok=True) - epoch_time = argument["epoch_time"] - PROMPT = argument["prompt"] - NEGPROMPT = argument["negative_prompt"] - HEIGHT = argument["height"] - WIDTH = argument["width"] - SEED = argument["seed"] - STRENGTH = argument.get("strength", 1) - INFERENCE_STEPS = argument["num_inference_steps"] - GUIDANCE_SCALE = argument["guidance_scale"] - - filename = f"{str(epoch_time)}_scale_{GUIDANCE_SCALE}_steps_{INFERENCE_STEPS}_seed_{SEED}.{FORMAT}" - filedir = f"{OUTDIR}/{filename}" - image.save(filedir) - with open(f"{OUTDIR}/{epoch_time}_prompt.txt", "w") as file: - file.write( - f"PROMPT: {PROMPT}\nNEG_PROMPT: {NEGPROMPT}\n\nINFERENCE_STEPS: {INFERENCE_STEPS}\nHeight: {HEIGHT}\nWidth: {WIDTH}\nSeed: {SEED}\n\nPrecision: {PRECISION}\nSTRENGTH: {STRENGTH}\nGUIDANCE_SCALE: {GUIDANCE_SCALE}" - ) - - -re_attention = re.compile( - r""" -\\\(| -\\\)| -\\\[| -\\]| -\\\\| -\\| -\(| -\[| -:([+-]?[.\d]+)\)| -\)| -]| -[^\\()\[\]:]+| -: -""", - re.X, -) - - -def parse_prompt_attention(text): - r""" - Parses a string with attention tokens and returns a list of pairs: text and its associated weight. - Accepted tokens are: - (abc) - increases attention to abc by a multiplier of 1.1 - (abc:3.12) - increases attention to abc by a multiplier of 3.12 - [abc] - decreases attention to abc by a multiplier of 1.1 - \( - literal character '(' - \[ - literal character '[' - \) - literal character ')' - \] - literal character ']' - \\ - literal character '\' - anything else - just text - >>> parse_prompt_attention('normal text') - [['normal text', 1.0]] - >>> parse_prompt_attention('an (important) word') - [['an ', 1.0], ['important', 1.1], [' word', 1.0]] - >>> parse_prompt_attention('(unbalanced') - [['unbalanced', 1.1]] - >>> parse_prompt_attention('\(literal\]') - [['(literal]', 1.0]] - >>> parse_prompt_attention('(unnecessary)(parens)') - [['unnecessaryparens', 1.1]] - >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).') - [['a ', 1.0], - ['house', 1.5730000000000004], - [' ', 1.1], - ['on', 1.0], - [' a ', 1.1], - ['hill', 0.55], - [', sun, ', 1.1], - ['sky', 1.4641000000000006], - ['.', 1.1]] - """ - - res = [] - round_brackets = [] - square_brackets = [] - - round_bracket_multiplier = 1.1 - square_bracket_multiplier = 1 / 1.1 - - def multiply_range(start_position, multiplier): - for p in range(start_position, len(res)): - res[p][1] *= multiplier - - for m in re_attention.finditer(text): - text = m.group(0) - weight = m.group(1) - - if text.startswith("\\"): - res.append([text[1:], 1.0]) - elif text == "(": - round_brackets.append(len(res)) - elif text == "[": - square_brackets.append(len(res)) - elif weight is not None and len(round_brackets) > 0: - multiply_range(round_brackets.pop(), float(weight)) - elif text == ")" and len(round_brackets) > 0: - multiply_range(round_brackets.pop(), round_bracket_multiplier) - elif text == "]" and len(square_brackets) > 0: - multiply_range(square_brackets.pop(), square_bracket_multiplier) - else: - res.append([text, 1.0]) - - for pos in round_brackets: - multiply_range(pos, round_bracket_multiplier) - - for pos in square_brackets: - multiply_range(pos, 
square_bracket_multiplier) - - if len(res) == 0: - res = [["", 1.0]] - - # merge runs of identical weights - i = 0 - while i + 1 < len(res): - if res[i][1] == res[i + 1][1]: - res[i][0] += res[i + 1][0] - res.pop(i + 1) - else: - i += 1 - - return res - - -def get_prompts_with_weights(pipe: DiffusionPipeline, prompt: List[str], max_length: int): - r""" - Tokenize a list of prompts and return its tokens with weights of each token. - - No padding, starting or ending token is included. - """ - tokens = [] - weights = [] - truncated = False - for text in prompt: - texts_and_weights = parse_prompt_attention(text) - text_token = [] - text_weight = [] - for word, weight in texts_and_weights: - # tokenize and discard the starting and the ending token - token = pipe.tokenizer(word).input_ids[1:-1] - text_token += token - - # copy the weight by length of token - text_weight += [weight] * len(token) - - # stop if the text is too long (longer than truncation limit) - if len(text_token) > max_length: - truncated = True - break - - # truncate - if len(text_token) > max_length: - truncated = True - text_token = text_token[:max_length] - text_weight = text_weight[:max_length] - - tokens.append(text_token) - weights.append(text_weight) - if truncated: - logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples") - return tokens, weights - - -def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77): - r""" - Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length. - """ - max_embeddings_multiples = (max_length - 2) // (chunk_length - 2) - weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length - for i in range(len(tokens)): - tokens[i] = [bos] + tokens[i] + [eos] + [pad] * (max_length - 2 - len(tokens[i])) - if no_boseos_middle: - weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i])) - else: - w = [] - if len(weights[i]) == 0: - w = [1.0] * weights_length - else: - for j in range(max_embeddings_multiples): - w.append(1.0) # weight for starting token in this chunk - w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))] - w.append(1.0) # weight for ending token in this chunk - w += [1.0] * (weights_length - len(w)) - weights[i] = w[:] - - return tokens, weights - - -def get_unweighted_text_embeddings( - pipe: DiffusionPipeline, text_input: paddle.Tensor, chunk_length: int, no_boseos_middle: Optional[bool] = True -): - """ - When the length of tokens is a multiple of the capacity of the text encoder, - it should be split into chunks and sent to the text encoder individually. 
- """ - max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2) - if max_embeddings_multiples > 1: - text_embeddings = [] - for i in range(max_embeddings_multiples): - # extract the i-th chunk - text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone() - - # cover the head and the tail by the starting and the ending tokens - text_input_chunk[:, 0] = text_input[0, 0] - text_input_chunk[:, -1] = text_input[0, -1] - - text_embedding = pipe.text_encoder(text_input_chunk)[0] - - if no_boseos_middle: - if i == 0: - # discard the ending token - text_embedding = text_embedding[:, :-1] - elif i == max_embeddings_multiples - 1: - # discard the starting token - text_embedding = text_embedding[:, 1:] - else: - # discard both starting and ending tokens - text_embedding = text_embedding[:, 1:-1] - - text_embeddings.append(text_embedding) - text_embeddings = paddle.concat(text_embeddings, axis=1) - else: - text_embeddings = pipe.text_encoder(text_input)[0] - return text_embeddings - - -def get_weighted_text_embeddings( - pipe: DiffusionPipeline, - prompt: Union[str, List[str]], - uncond_prompt: Optional[Union[str, List[str]]] = None, - max_embeddings_multiples: Optional[int] = 1, - no_boseos_middle: Optional[bool] = False, - skip_parsing: Optional[bool] = False, - skip_weighting: Optional[bool] = False, - **kwargs -): - r""" - Prompts can be assigned with local weights using brackets. For example, - prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful', - and the embedding tokens corresponding to the words get multiplied by a constant, 1.1. - - Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the original mean. - - Args: - pipe (`DiffusionPipeline`): - Pipe to provide access to the tokenizer and the text encoder. - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - uncond_prompt (`str` or `List[str]`): - The unconditional prompt or prompts for guide the image generation. If unconditional prompt - is provided, the embeddings of prompt and uncond_prompt are concatenated. - max_embeddings_multiples (`int`, *optional*, defaults to `1`): - The max multiple length of prompt embeddings compared to the max output length of text encoder. - no_boseos_middle (`bool`, *optional*, defaults to `False`): - If the length of text token is multiples of the capacity of text encoder, whether reserve the starting and - ending token in each of the chunk in the middle. - skip_parsing (`bool`, *optional*, defaults to `False`): - Skip the parsing of brackets. - skip_weighting (`bool`, *optional*, defaults to `False`): - Skip the weighting. When the parsing is skipped, it is forced True. 
- """ - max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 - if isinstance(prompt, str): - prompt = [prompt] - - if not skip_parsing: - prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2) - if uncond_prompt is not None: - if isinstance(uncond_prompt, str): - uncond_prompt = [uncond_prompt] - uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2) - else: - prompt_tokens = [ - token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids - ] - prompt_weights = [[1.0] * len(token) for token in prompt_tokens] - if uncond_prompt is not None: - if isinstance(uncond_prompt, str): - uncond_prompt = [uncond_prompt] - uncond_tokens = [ - token[1:-1] - for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids - ] - uncond_weights = [[1.0] * len(token) for token in uncond_tokens] - - # round up the longest length of tokens to a multiple of (model_max_length - 2) - max_length = max([len(token) for token in prompt_tokens]) - if uncond_prompt is not None: - max_length = max(max_length, max([len(token) for token in uncond_tokens])) - - max_embeddings_multiples = min( - max_embeddings_multiples, (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1 - ) - max_embeddings_multiples = max(1, max_embeddings_multiples) - max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 - - # pad the length of tokens and weights - # support bert tokenizer - bos = pipe.tokenizer.bos_token_id if pipe.tokenizer.bos_token_id is not None else pipe.tokenizer.cls_token_id - eos = pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id is not None else pipe.tokenizer.sep_token_id - pad = pipe.tokenizer.pad_token_id - prompt_tokens, prompt_weights = pad_tokens_and_weights( - prompt_tokens, - prompt_weights, - max_length, - bos, - eos, - pad, - no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, - ) - prompt_tokens = paddle.to_tensor(prompt_tokens, dtype=paddle.int64) - if uncond_prompt is not None: - uncond_tokens, uncond_weights = pad_tokens_and_weights( - uncond_tokens, - uncond_weights, - max_length, - bos, - eos, - pad, - no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, - ) - uncond_tokens = paddle.to_tensor(uncond_tokens, dtype=paddle.int64) - - # get the embeddings - text_embeddings = get_unweighted_text_embeddings( - pipe, - prompt_tokens, - pipe.tokenizer.model_max_length, - no_boseos_middle=no_boseos_middle, - ) - prompt_weights = paddle.to_tensor(prompt_weights, dtype=paddle.float32).cast(text_embeddings.dtype) - if uncond_prompt is not None: - uncond_embeddings = get_unweighted_text_embeddings( - pipe, - uncond_tokens, - pipe.tokenizer.model_max_length, - no_boseos_middle=no_boseos_middle, - ) - uncond_weights = paddle.to_tensor(uncond_weights, dtype=paddle.float32).cast(uncond_embeddings.dtype) - - # assign weights to the prompts and normalize in the sense of mean - # TODO: should we normalize by chunk or in a whole (current implementation)? 
- if (not skip_parsing) and (not skip_weighting): - previous_mean = text_embeddings.mean(axis=[-2, -1]) - text_embeddings *= prompt_weights.unsqueeze(-1) - text_embeddings *= previous_mean / text_embeddings.mean(axis=[-2, -1], keepdim=True) - if uncond_prompt is not None: - previous_mean = uncond_embeddings.mean(axis=[-2, -1]) - uncond_embeddings *= uncond_weights.unsqueeze(-1) - uncond_embeddings *= previous_mean / uncond_embeddings.mean(axis=[-2, -1], keepdim=True) - - if uncond_prompt is not None: - return text_embeddings, uncond_embeddings - return text_embeddings, None - - -def preprocess_image(image): - w, h = image.size - w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 - image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) - image = np.array(image).astype(np.float32) / 255.0 - image = image[None].transpose(0, 3, 1, 2) - image = paddle.to_tensor(image) - return 2.0 * image - 1.0 - - -def preprocess_mask(mask, scale_factor=8): - mask = mask.convert("L") - w, h = mask.size - w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 - mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"]) - mask = np.array(mask).astype(np.float32) / 255.0 - mask = np.tile(mask, (4, 1, 1)) - mask = mask[None].transpose(0, 1, 2, 3) # what does this step do? - mask = 1 - mask # repaint white, keep black - mask = paddle.to_tensor(mask) - return mask - - -class StableDiffusionPipelineAllinOne(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): - r""" - Pipeline for text-to-image image-to-image inpainting generation using Stable Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular xxxx, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`PNDMScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] - or [`DPMSolverMultistepScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/junnyu/stable-diffusion-v1-4-paddle) for details. - feature_extractor ([`CLIPFeatureExtractor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
- """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[ - DDIMScheduler, - PNDMScheduler, - LMSDiscreteScheduler, - EulerDiscreteScheduler, - EulerAncestralDiscreteScheduler, - DPMSolverMultistepScheduler, - ], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - requires_safety_checker: bool = False, - ): - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " - "to update the config accordingly as leaving `steps_offset` might led to incorrect results" - " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," - " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file" - ) - deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." - " `clip_sample` should be set to False in the configuration file. Please make sure to update the" - " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" - " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" - " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" - ) - deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["clip_sample"] = False - scheduler._internal_dict = FrozenDict(new_config) - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
- ) - is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version).base_version - ) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 - if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: - deprecation_message = ( - "The configuration file of the unet has set the default `sample_size` to smaller than" - " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" - " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" - " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" - " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" - " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" - " in the config might lead to incorrect results in future versions. If you have downloaded this" - " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file" - ) - deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(unet.config) - new_config["sample_size"] = 64 - unet._internal_dict = FrozenDict(new_config) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - self.__init__additional__() - - def __init__additional__(self): - if not hasattr(self, "vae_scale_factor"): - setattr(self, "vae_scale_factor", 2 ** (len(self.vae.config.block_out_channels) - 1)) - - def __call__(self, *args, **kwargs): - return self.text2image(*args, **kwargs) - - def text2img(self, *args, **kwargs): - return self.text2image(*args, **kwargs) - - def _encode_prompt( - self, - prompt, - negative_prompt, - max_embeddings_multiples, - no_boseos_middle, - skip_parsing, - skip_weighting, - do_classifier_free_guidance, - num_images_per_prompt, - **kwargs, - ): - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - if negative_prompt is None: - negative_prompt = [""] * batch_size - elif isinstance(negative_prompt, str): - negative_prompt = [negative_prompt] * batch_size - if batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - - text_embeddings, uncond_embeddings = get_weighted_text_embeddings( - pipe=self, - prompt=prompt, - uncond_prompt=negative_prompt if do_classifier_free_guidance else None, - max_embeddings_multiples=max_embeddings_multiples, - no_boseos_middle=no_boseos_middle, - skip_parsing=skip_parsing, - skip_weighting=skip_weighting, - **kwargs, - ) - bs_embed, seq_len, _ = text_embeddings.shape - text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1]) - text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - if do_classifier_free_guidance: - seq_len = uncond_embeddings.shape[1] - uncond_embeddings = uncond_embeddings.tile([1, num_images_per_prompt, 1]) - uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) - - return text_embeddings - - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - def decode_latents(self, latents): - latents = 1 / 0.18215 * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - def prepare_extra_step_kwargs(self, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - return extra_step_kwargs - - def check_inputs_text2img(self, prompt, height, width, callback_steps): - if not isinstance(prompt, str) and not isinstance(prompt, list): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - def check_inputs_img2img_inpaint(self, prompt, strength, callback_steps): - if not isinstance(prompt, str) and not isinstance(prompt, list): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if strength < 0 or strength > 1: - raise ValueError(f"The value of strength should in [1.0, 1.0] but is {strength}") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." 
- ) - - def prepare_latents_text2img(self, batch_size, num_channels_latents, height, width, dtype, latents=None): - shape = [batch_size, num_channels_latents, height // 8, width // 8] - if latents is None: - latents = paddle.randn(shape, dtype=dtype) - else: - if latents.shape != shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - def prepare_latents_img2img(self, image, timestep, num_images_per_prompt, dtype): - image = image.cast(dtype=dtype) - init_latent_dist = self.vae.encode(image).latent_dist - init_latents = init_latent_dist.sample() - init_latents = 0.18215 * init_latents - - b, c, h, w = init_latents.shape - init_latents = init_latents.tile([1, num_images_per_prompt, 1, 1]) - init_latents = init_latents.reshape([b * num_images_per_prompt, c, h, w]) - - # add noise to latents using the timesteps - noise = paddle.randn(init_latents.shape, dtype=dtype) - - # get latents - init_latents = self.scheduler.add_noise(init_latents, noise, timestep) - latents = init_latents - - return latents - - def get_timesteps(self, num_inference_steps, strength): - # get the original timestep using init_timestep - offset = self.scheduler.config.get("steps_offset", 0) - init_timestep = int(num_inference_steps * strength) + offset - init_timestep = min(init_timestep, num_inference_steps) - - t_start = max(num_inference_steps - init_timestep + offset, 0) - timesteps = self.scheduler.timesteps[t_start:] - - return timesteps, num_inference_steps - t_start - - def prepare_latents_inpaint(self, image, timestep, num_images_per_prompt, dtype): - image = image.cast(dtype) - init_latent_dist = self.vae.encode(image).latent_dist - init_latents = init_latent_dist.sample() - init_latents = 0.18215 * init_latents - - b, c, h, w = init_latents.shape - init_latents = init_latents.tile([1, num_images_per_prompt, 1, 1]) - init_latents = init_latents.reshape([b * num_images_per_prompt, c, h, w]) - - init_latents_orig = init_latents - - # add noise to latents using the timesteps - noise = paddle.randn(init_latents.shape, dtype=dtype) - init_latents = self.scheduler.add_noise(init_latents, noise, timestep) - latents = init_latents - return latents, init_latents_orig, noise - - @paddle.no_grad() - def text2image( - self, - prompt: Union[str, List[str]], - height: int = 512, - width: int = 512, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - seed: Optional[int] = None, - latents: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - # new add - max_embeddings_multiples: Optional[int] = 3, - no_boseos_middle: Optional[bool] = False, - skip_parsing: Optional[bool] = False, - skip_weighting: Optional[bool] = False, - **kwargs, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - height (`int`, *optional*, defaults to 512): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to 512): - The width in pixels of the generated image. 
- num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - seed (`int`, *optional*): - Random number seed. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `seed`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - seed = random.randint(0, 2**32) if seed is None else seed - argument = dict( - prompt=prompt, - negative_prompt=negative_prompt, - height=height, - width=width, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - num_images_per_prompt=num_images_per_prompt, - eta=eta, - seed=seed, - latents=latents, - max_embeddings_multiples=max_embeddings_multiples, - no_boseos_middle=no_boseos_middle, - skip_parsing=skip_parsing, - skip_weighting=skip_weighting, - epoch_time=time.time(), - ) - paddle.seed(seed) - # 1. Check inputs. Raise error if not correct - self.check_inputs_text2img(prompt, height, width, callback_steps) - - # 2. 
Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - text_embeddings = self._encode_prompt( - prompt, - negative_prompt, - max_embeddings_multiples, - no_boseos_middle, - skip_parsing, - skip_weighting, - do_classifier_free_guidance, - num_images_per_prompt, - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = self.unet.in_channels - latents = self.prepare_latents_text2img( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - text_embeddings.dtype, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(eta) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 8. Post-processing - image = self.decode_latents(latents) - - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, text_embeddings.dtype) - - # 10. 
Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image, argument=argument) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - @paddle.no_grad() - def img2img( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - strength: float = 0.8, - height=None, - width=None, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.0, - seed: Optional[int] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - # new add - max_embeddings_multiples: Optional[int] = 1, - no_boseos_middle: Optional[bool] = False, - skip_parsing: Optional[bool] = False, - skip_weighting: Optional[bool] = False, - **kwargs, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. - strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. - `image` will be used as a starting point, adding more noise to it the larger the `strength`. The - number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added - noise will be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. This parameter will be modulated by `strength`. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - seed (`int`, *optional*): - A random seed. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
- return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - seed = random.randint(0, 2**32) if seed is None else seed - image_str = image - if isinstance(image_str, str): - image = load_image(image_str) - - if height is None and width is None: - width = (image.size[0] // 8) * 8 - height = (image.size[1] // 8) * 8 - elif height is None and width is not None: - height = (image.size[1] // 8) * 8 - elif width is None and height is not None: - width = (image.size[0] // 8) * 8 - else: - height = height - width = width - - argument = dict( - prompt=prompt, - image=image_str, - negative_prompt=negative_prompt, - height=height, - width=width, - strength=strength, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - num_images_per_prompt=num_images_per_prompt, - eta=eta, - seed=seed, - max_embeddings_multiples=max_embeddings_multiples, - no_boseos_middle=no_boseos_middle, - skip_parsing=skip_parsing, - skip_weighting=skip_weighting, - epoch_time=time.time(), - ) - paddle.seed(seed) - - # 1. Check inputs - self.check_inputs_img2img_inpaint(prompt, strength, callback_steps) - - # 2. Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - text_embeddings = self._encode_prompt( - prompt, - negative_prompt, - max_embeddings_multiples, - no_boseos_middle, - skip_parsing, - skip_weighting, - do_classifier_free_guidance, - num_images_per_prompt, - ) - - # 4. Preprocess image - if isinstance(image, PIL.Image.Image): - image = image.resize((width, height)) - image = preprocess_image(image) - - # 5. set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) - - # 6. Prepare latent variables - latents = self.prepare_latents_img2img(image, latent_timestep, num_images_per_prompt, text_embeddings.dtype) - - # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(eta) - - # 8. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 9. Post-processing - image = self.decode_latents(latents) - - # 10. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, text_embeddings.dtype) - - # 11. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image, argument=argument) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - @paddle.no_grad() - def inpaint( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - mask_image: Union[paddle.Tensor, PIL.Image.Image], - height=None, - width=None, - strength: float = 0.8, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.0, - seed: Optional[int] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - # new add - max_embeddings_multiples: Optional[int] = 1, - no_boseos_middle: Optional[bool] = False, - skip_parsing: Optional[bool] = False, - skip_weighting: Optional[bool] = False, - **kwargs, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. This is the image whose masked region will be inpainted. - mask_image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be - replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a - PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should - contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. - strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. 
When `strength` - is 1, the denoising process will be run on the masked area for the full number of iterations specified - in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more - noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur. - num_inference_steps (`int`, *optional*, defaults to 50): - The reference number of denoising steps. More denoising steps usually lead to a higher quality image at - the expense of slower inference. This parameter will be modulated by `strength`, as explained above. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - seed (`int`, *optional*): - A random seed. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. 
- """ - seed = random.randint(0, 2**32) if seed is None else seed - image_str = image - mask_image_str = mask_image - - if isinstance(image_str, str): - image = load_image(image_str) - if isinstance(mask_image_str, str): - mask_image = load_image(mask_image_str) - - if height is None and width is None: - width = (image.size[0] // 8) * 8 - height = (image.size[1] // 8) * 8 - elif height is None and width is not None: - height = (image.size[1] // 8) * 8 - elif width is None and height is not None: - width = (image.size[0] // 8) * 8 - else: - height = height - width = width - - argument = dict( - prompt=prompt, - image=image_str, - mask_image=mask_image_str, - negative_prompt=negative_prompt, - height=height, - width=width, - strength=strength, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - num_images_per_prompt=num_images_per_prompt, - eta=eta, - seed=seed, - max_embeddings_multiples=max_embeddings_multiples, - no_boseos_middle=no_boseos_middle, - skip_parsing=skip_parsing, - skip_weighting=skip_weighting, - epoch_time=time.time(), - ) - paddle.seed(seed) - - # 1. Check inputs - self.check_inputs_img2img_inpaint(prompt, strength, callback_steps) - - # 2. Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - text_embeddings = self._encode_prompt( - prompt, - negative_prompt, - max_embeddings_multiples, - no_boseos_middle, - skip_parsing, - skip_weighting, - do_classifier_free_guidance, - num_images_per_prompt, - ) - - if not isinstance(image, paddle.Tensor): - image = image.resize((width, height)) - image = preprocess_image(image) - - if not isinstance(mask_image, paddle.Tensor): - mask_image = mask_image.resize((width, height)) - mask_image = preprocess_mask(mask_image) - - # 5. set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) - - # 6. Prepare latent variables - # encode the init image into latents and scale the latents - latents, init_latents_orig, noise = self.prepare_latents_inpaint( - image, latent_timestep, num_images_per_prompt, text_embeddings.dtype - ) - - # 7. Prepare mask latent - mask = mask_image.cast(latents.dtype) - mask = paddle.concat([mask] * batch_size * num_images_per_prompt) - - # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(eta) - - # 9. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - # masking - init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, t) - - latents = (init_latents_proper * mask) + (latents * (1 - mask)) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 10. Post-processing - image = self.decode_latents(latents) - - # 11. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, text_embeddings.dtype) - - # 12. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image, argument=argument) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - @staticmethod - def numpy_to_pil(images, **kwargs): - """ - Convert a numpy image or a batch of images to a PIL image. - """ - if images.ndim == 3: - images = images[None, ...] - images = (images * 255).round().astype("uint8") - pil_images = [] - argument = kwargs.pop("argument", None) - for image in images: - image = PIL.Image.fromarray(image) - if argument is not None: - image.argument = argument - pil_images.append(image) - - return pil_images diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py deleted file mode 100644 index 03f138fcc599..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +++ /dev/null @@ -1,993 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
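For reference, a minimal usage sketch of the `StableDiffusionPipelineAllinOne` removed above. The `text2img`/`img2img` entry points, the `seed` and `max_embeddings_multiples` arguments, and the `(word:weight)` / `[word]` prompt syntax all come from the deleted code; the checkpoint id is illustrative, and this assumes a ppdiffusers release that still ships this pipeline.

```python
# Illustrative only: assumes a ppdiffusers version prior to this removal.
from ppdiffusers import StableDiffusionPipelineAllinOne

# Placeholder checkpoint id; any Stable Diffusion v1.x layout should work.
pipe = StableDiffusionPipelineAllinOne.from_pretrained("runwayml/stable-diffusion-v1-5")

# Text-to-image with the weighted prompt syntax handled by parse_prompt_attention:
# "(phrase:1.3)" scales the matching token embeddings by 1.3, "[phrase]" scales them
# by 1/1.1, and the weighted embeddings are rescaled to preserve the original mean.
result = pipe.text2img(
    prompt="a (very beautiful:1.3) mountain landscape, [blurry]",
    negative_prompt="low quality",
    height=512,
    width=512,
    num_inference_steps=50,
    guidance_scale=7.5,
    seed=1234,
    max_embeddings_multiples=3,  # allow prompts longer than one CLIP window
)
result.images[0].save("text2img.jpg")

# Image-to-image reuses the same prompt handling; `strength` controls how much
# noise is added to the encoded init image before denoising resumes.
result = pipe.img2img(
    prompt="oil painting of the same scene",
    image="text2img.jpg",
    strength=0.6,
    seed=1234,
)
result.images[0].save("img2img.jpg")
```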
- -import inspect -import math -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import numpy as np -import paddle -import paddle.nn as nn -from paddle.nn import functional as F - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer - -from ...loaders import TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...models.attention_processor import Attention -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import logging, randn_tensor, replace_example_docstring -from ..pipeline_utils import DiffusionPipeline -from . import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import paddle - >>> from ppdiffusers import StableDiffusionAttendAndExcitePipeline - - >>> pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained( - ... "CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16 - ... ) - - - >>> prompt = "a cat and a frog" - - >>> # use get_indices function to find out indices of the tokens you want to alter - >>> pipe.get_indices(prompt) - {0: '<|startoftext|>', 1: 'a', 2: 'cat', 3: 'and', 4: 'a', 5: 'frog', 6: '<|endoftext|>'} - - >>> token_indices = [2, 5] - >>> seed = 6141 - >>> generator = paddle.Generator().manual_seed(seed) - - >>> images = pipe( - ... prompt=prompt, - ... token_indices=token_indices, - ... guidance_scale=7.5, - ... generator=generator, - ... num_inference_steps=50, - ... max_iter_to_alter=25, - ... ).images - - >>> image = images[0] - >>> image.save(f"../images/{prompt}_{seed}.png") - ``` -""" - - -class AttentionStore: - @staticmethod - def get_empty_store(): - return {"down": [], "mid": [], "up": []} - - def __call__(self, attn, is_cross: bool, place_in_unet: str): - if self.cur_att_layer >= 0 and is_cross: - if attn.shape[1] == np.prod(self.attn_res): - self.step_store[place_in_unet].append(attn) - - self.cur_att_layer += 1 - if self.cur_att_layer == self.num_att_layers: - self.cur_att_layer = 0 - self.between_steps() - - def between_steps(self): - self.attention_store = self.step_store - self.step_store = self.get_empty_store() - - def get_average_attention(self): - average_attention = self.attention_store - return average_attention - - def aggregate_attention(self, from_where: List[str]) -> paddle.Tensor: - """Aggregates the attention across the different layers and heads at the specified resolution.""" - out = [] - attention_maps = self.get_average_attention() - for location in from_where: - for item in attention_maps[location]: - cross_maps = item.reshape([-1, self.attn_res[0], self.attn_res[1], item.shape[-1]]) - out.append(cross_maps) - out = paddle.concat(out, axis=0) - out = out.sum(0) / out.shape[0] - return out - - def reset(self): - self.cur_att_layer = 0 - self.step_store = self.get_empty_store() - self.attention_store = {} - - def __init__(self, attn_res): - """ - Initialize an empty AttentionStore :param step_index: used to visualize only a specific step in the diffusion - process - """ - self.num_att_layers = -1 - self.cur_att_layer = 0 - self.step_store = self.get_empty_store() - self.attention_store = {} - self.curr_step_index = 0 - self.attn_res = attn_res - - -class AttendExciteAttnProcessor: - def __init__(self, attnstore, place_in_unet): - super().__init__() - self.attnstore = attnstore - self.place_in_unet = place_in_unet - - def __call__(self, attn: 
Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): - batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - - query = attn.to_q(hidden_states) - - is_cross = encoder_hidden_states is not None - encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - query = attn.head_to_batch_dim(query) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - - attention_probs = attn.get_attention_scores(query, key, attention_mask) - - # only need to store attention maps during the Attend and Excite process - if not attention_probs.stop_gradient: - # TODO must flatten (0, 1) - # [bs, num_heads, q_len, k_len] -> [bs*num_heads, q_len, k_len] - self.attnstore(attention_probs.flatten(0, 1), is_cross, self.place_in_unet) - - hidden_states = paddle.matmul(attention_probs, value) - hidden_states = attn.batch_to_head_dim(hidden_states) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - return hidden_states - - -class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversionLoaderMixin): - r""" - Pipeline for text-to-image generation using Stable Diffusion and Attend and Excite. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
- """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = True, - ): - super().__init__() - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. 
- """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
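As a toy shape check of the tile-and-reshape pattern `_encode_prompt` uses above to repeat each prompt embedding `num_images_per_prompt` times (random values, made-up sizes; only the shapes matter):

import paddle

bs, seq_len, dim, num_images_per_prompt = 2, 77, 8, 3
prompt_embeds = paddle.randn([bs, seq_len, dim])                     # [2, 77, 8]
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])    # [2, 231, 8]
prompt_embeds = prompt_embeds.reshape([bs * num_images_per_prompt, seq_len, -1])
print(prompt_embeds.shape)                                           # [6, 77, 8]

The same pattern is applied to `negative_prompt_embeds` before the two halves are concatenated into a single `[uncond, cond]` batch for classifier-free guidance.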
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - indices, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - indices_is_list_ints = isinstance(indices, list) and isinstance(indices[0], int) - indices_is_list_list_ints = ( - isinstance(indices, list) and isinstance(indices[0], list) and isinstance(indices[0][0], int) - ) - - if not indices_is_list_ints and not indices_is_list_list_ints: - raise TypeError("`indices` must be a list of ints or a list of a list of ints") - - if (indices is None) or (indices is not None and not isinstance(indices, List)): - raise ValueError(f"`indices` has to be a list but is {type(indices)}") - - if indices_is_list_ints: - indices_batch_size = 1 - elif indices_is_list_list_ints: - indices_batch_size = len(indices) - - if prompt is not None and isinstance(prompt, str): - prompt_batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - prompt_batch_size = len(prompt) - elif prompt_embeds is not None: - prompt_batch_size = prompt_embeds.shape[0] - - if indices_batch_size != prompt_batch_size: - raise ValueError( - f"indices batch size must be same as prompt batch size. 
indices batch size: {indices_batch_size}, prompt batch size: {prompt_batch_size}" - ) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @staticmethod - def _compute_max_attention_per_index( - attention_maps: paddle.Tensor, - indices: List[int], - ) -> List[paddle.Tensor]: - """Computes the maximum attention value for each of the tokens we wish to alter.""" - attention_for_text = attention_maps[:, :, 1:-1] - attention_for_text *= 100 - attention_for_text = F.softmax(attention_for_text, axis=-1) - - # Shift indices since we removed the first token - indices = [index - 1 for index in indices] - - # Extract the maximum values - max_indices_list = [] - for i in indices: - image = attention_for_text[:, :, i] - smoothing = GaussianSmoothing() - input = F.pad(image.unsqueeze(0).unsqueeze(0), (1, 1, 1, 1), mode="reflect") - image = smoothing(input).squeeze(0).squeeze(0) - # paddle.max donot support float16 - max_indices_list.append(image.max()) - return max_indices_list - - def _aggregate_and_get_max_attention_per_token( - self, - indices: List[int], - ): - """Aggregates the attention for each token and computes the max activation value for each token to alter.""" - attention_maps = self.attention_store.aggregate_attention( - from_where=("up", "down", "mid"), - ) - max_attention_per_index = self._compute_max_attention_per_index( - attention_maps=attention_maps, - indices=indices, - ) - return max_attention_per_index - - @staticmethod - def _compute_loss(max_attention_per_index: List[paddle.Tensor]) -> paddle.Tensor: - """Computes the attend-and-excite loss using the maximum attention value for each token.""" - losses = [max(0, 1.0 - curr_max) for curr_max in max_attention_per_index] - loss = max(losses) - return loss - - @staticmethod - def _update_latent(latents: paddle.Tensor, loss: paddle.Tensor, step_size: float) -> paddle.Tensor: - """Update the latent according to the computed loss.""" - loss.stop_gradient = False - grad_cond = paddle.autograd.grad(loss, [latents], retain_graph=True)[0] - latents = latents - step_size * grad_cond - return latents - - def _perform_iterative_refinement_step( - self, - latents: paddle.Tensor, - indices: List[int], - loss: paddle.Tensor, - threshold: float, - text_embeddings: paddle.Tensor, - step_size: float, - t: int, - max_refinement_steps: int = 20, - ): - """ - Performs the iterative latent refinement introduced in the paper. Here, we continuously update the latent code - according to our loss objective until the given threshold is reached for all tokens. 
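A small worked example of `_compute_loss` and the latent update above, with made-up per-token attention maxima (plain Python floats instead of paddle tensors, purely illustrative):

# two subject tokens; the second already receives strong attention, the first does not
max_attention_per_index = [0.35, 0.72]
losses = [max(0.0, 1.0 - m) for m in max_attention_per_index]   # [0.65, 0.28]
loss = max(losses)                                              # 0.65, driven by the most neglected token
# _update_latent then takes a gradient step on the latents,
#   latents <- latents - step_size * d(loss)/d(latents),
# which pushes the attention maximum of that neglected token up.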
- """ - iteration = 0 - target_loss = max(0, 1.0 - threshold) - while loss > target_loss: - iteration += 1 - - latents = latents.clone().detach() - latents.stop_gradient = False - self.unet(latents, t, encoder_hidden_states=text_embeddings).sample - self.unet.clear_gradients() - - # Get max activation value for each subject token - max_attention_per_index = self._aggregate_and_get_max_attention_per_token( - indices=indices, - ) - - loss = self._compute_loss(max_attention_per_index) - - if loss != 0: - latents = self._update_latent(latents, loss, step_size) - - logger.info(f"\t Try {iteration}. loss: {loss}") - - if iteration >= max_refinement_steps: - logger.info(f"\t Exceeded max number of iterations ({max_refinement_steps})! ") - break - - # Run one more time but don't compute gradients and update the latents. - # We just need to compute the new loss - the grad update will occur below - latents = latents.clone().detach() - latents.stop_gradient = False - - _ = self.unet(latents, t, encoder_hidden_states=text_embeddings).sample - self.unet.clear_gradients() - - # Get max activation value for each subject token - max_attention_per_index = self._aggregate_and_get_max_attention_per_token( - indices=indices, - ) - loss = self._compute_loss(max_attention_per_index) - logger.info(f"\t Finished with loss of: {loss}") - return loss, latents, max_attention_per_index - - def register_attention_control(self): - attn_procs = {} - cross_att_count = 0 - for name in self.unet.attn_processors.keys(): - if name.startswith("mid_block"): - place_in_unet = "mid" - elif name.startswith("up_blocks"): - place_in_unet = "up" - elif name.startswith("down_blocks"): - place_in_unet = "down" - else: - continue - - cross_att_count += 1 - attn_procs[name] = AttendExciteAttnProcessor(attnstore=self.attention_store, place_in_unet=place_in_unet) - - self.unet.set_attn_processor(attn_procs) - self.attention_store.num_att_layers = cross_att_count - - def get_indices(self, prompt: str) -> Dict[str, int]: - """Utility function to list the indices of the tokens you wish to alte""" - ids = self.tokenizer(prompt).input_ids - indices = {i: tok for tok, i in zip(self.tokenizer.convert_ids_to_tokens(ids), range(len(ids)))} - return indices - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]], - token_indices: Union[List[int], List[List[int]]], - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - max_iter_to_alter: int = 25, - thresholds: dict = {0: 0.05, 10: 0.5, 20: 0.8}, - scale_factor: int = 20, - attn_res: Optional[Tuple[int]] = (16, 16), - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. 
- token_indices (`List[int]`): - The token indices to alter with attend-and-excite. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. 
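As a toy illustration of how `guidance_scale` enters the computation (this is the combination performed later in the denoising loop; the tensors here are random placeholders):

import paddle

guidance_scale = 7.5
noise_pred_uncond = paddle.randn([1, 4, 64, 64])   # prediction for the unconditional half
noise_pred_text = paddle.randn([1, 4, 64, 64])     # prediction for the text-conditioned half
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# guidance_scale == 1.0 recovers the text-conditioned prediction alone;
# larger values push the sample further toward the prompt.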
- cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/ppdiffusers/models/cross_attention.py). - max_iter_to_alter (`int`, *optional*, defaults to `25`): - Number of denoising steps to apply attend-and-excite. The first denoising steps are - where the attend-and-excite is applied. I.e. if `max_iter_to_alter` is 25 and there are a total of `30` - denoising steps, the first 25 denoising steps will apply attend-and-excite and the last 5 will not - apply attend-and-excite. - thresholds (`dict`, *optional*, defaults to `{0: 0.05, 10: 0.5, 20: 0.8}`): - Dictionary defining the iterations and desired thresholds to apply iterative latent refinement in. - scale_factor (`int`, *optional*, default to 20): - Scale factor that controls the step size of each Attend and Excite update. - attn_res (`tuple`, *optional*, default computed from width and height): - The 2D resolution of the semantic attention map. - - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. :type attention_store: object - """ - - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - token_indices, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - if attn_res is None: - attn_res = int(np.ceil(width / 32)), int(np.ceil(height / 32)) - self.attention_store = AttentionStore(attn_res) - self.register_attention_control() - - # default config for step size from original repo - scale_range = np.linspace(1.0, 0.5, len(self.scheduler.timesteps)) - step_size = scale_factor * np.sqrt(scale_range) - - text_embeddings = ( - prompt_embeds[batch_size * num_images_per_prompt :] if do_classifier_free_guidance else prompt_embeds - ) - - if isinstance(token_indices[0], int): - token_indices = [token_indices] - - indices = [] - - for ind in token_indices: - indices = indices + [ind] * num_images_per_prompt - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # Attend and excite process - with paddle.set_grad_enabled(True): - latents = latents.clone().detach() - latents.stop_gradient = False - updated_latents = [] - for latent, index, text_embedding in zip(latents, indices, text_embeddings): - # Forward pass of denoising with text conditioning - latent = latent.unsqueeze(0) - text_embedding = text_embedding.unsqueeze(0) - - self.unet( - latent, - t, - encoder_hidden_states=text_embedding, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - self.unet.clear_gradients() - - # Get max activation value for each subject token - max_attention_per_index = self._aggregate_and_get_max_attention_per_token( - indices=index, - ) - - loss = self._compute_loss(max_attention_per_index=max_attention_per_index) - - # If this is an iterative refinement step, verify we have reached the desired threshold for all - if i in thresholds.keys() and loss > 1.0 - thresholds[i]: - loss, latent, max_attention_per_index = self._perform_iterative_refinement_step( - latents=latent, - indices=index, - loss=loss, - threshold=thresholds[i], - text_embeddings=text_embedding, - step_size=step_size[i], - t=t, - ) - - # Perform gradient update - if i < max_iter_to_alter: - if loss != 0: - latent = self._update_latent( - latents=latent, - loss=loss, - step_size=step_size[i], - ) - logger.info(f"Iteration {i} | Loss: {loss.item():0.4f}") - - updated_latents.append(latent) - - latents = paddle.concat(updated_latents, axis=0) - - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 8. Post-processing - image = self.decode_latents(latents) - - # 9. 
Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - - # 10. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - -class GaussianSmoothing(nn.Layer): - """ - Arguments: - Apply gaussian smoothing on a 1d, 2d or 3d tensor. Filtering is performed separately for each channel in the input - using a depthwise convolution. - channels (int, sequence): Number of channels of the input tensors. Output will - have this number of channels as well. - kernel_size (int, sequence): Size of the gaussian kernel. sigma (float, sequence): Standard deviation of the - gaussian kernel. dim (int, optional): The number of dimensions of the data. - Default value is 2 (spatial). - """ - - # channels=1, kernel_size=kernel_size, sigma=sigma, dim=2 - def __init__( - self, - channels: int = 1, - kernel_size: int = 3, - sigma: float = 0.5, - dim: int = 2, - ): - super().__init__() - - if isinstance(kernel_size, int): - kernel_size = [kernel_size] * dim - if isinstance(sigma, float): - sigma = [sigma] * dim - - # The gaussian kernel is the product of the - # gaussian function of each dimension. - kernel = 1 - meshgrids = paddle.meshgrid([paddle.arange(size, dtype=paddle.float32) for size in kernel_size]) - for size, std, mgrid in zip(kernel_size, sigma, meshgrids): - mean = (size - 1) / 2 - kernel *= 1 / (std * math.sqrt(2 * math.pi)) * paddle.exp(-(((mgrid - mean) / (2 * std)) ** 2)) - - # Make sure sum of values in gaussian kernel equals 1. - kernel = kernel / paddle.sum(kernel) - - # Reshape to depthwise convolutional weight - kernel = kernel.reshape([1, 1, *kernel.shape]) - kernel = kernel.tile([channels, *[1] * (kernel.ndim - 1)]) - - self.register_buffer("weight", kernel) - self.groups = channels - - if dim == 1: - self.conv = F.conv1d - elif dim == 2: - self.conv = F.conv2d - elif dim == 3: - self.conv = F.conv3d - else: - raise RuntimeError("Only 1, 2 and 3 dimensions are supported. Received {}.".format(dim)) - - def forward(self, input): - """ - Arguments: - Apply gaussian filter to input. - input (paddle.Tensor): Input to apply gaussian filter on. - Returns: - filtered (paddle.Tensor): Filtered output. - """ - return self.conv(input, weight=self.weight.cast(input.dtype), groups=self.groups) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py deleted file mode 100644 index 758d570cda29..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ /dev/null @@ -1,947 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import inspect -import os -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import numpy as np -import paddle -import paddle.nn as nn -import PIL.Image - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer - -from ...loaders import TextualInversionLoaderMixin -from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel -from ...models.controlnet import ControlNetOutput -from ...models.modeling_utils import ModelMixin -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import PIL_INTERPOLATION, logging, randn_tensor, replace_example_docstring -from ..pipeline_utils import DiffusionPipeline -from . import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> # !pip install opencv-python - >>> from ppdiffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler - >>> from ppdiffusers.utils import load_image - >>> import numpy as np - >>> import paddle - >>> import cv2 - >>> from PIL import Image - >>> # download an image - >>> image = load_image( - ... "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" - ... ) - >>> image = np.array(image) - >>> # get canny image - >>> image = cv2.Canny(image, 100, 200) - >>> image = image[:, :, None] - >>> image = np.concatenate([image, image, image], axis=2) - >>> canny_image = Image.fromarray(image) - >>> # load control net and stable diffusion v1-5 - >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", paddle_dtype=paddle.float16) - >>> pipe = StableDiffusionControlNetPipeline.from_pretrained( - ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, paddle_dtype=paddle.float16 - ... ) - >>> # speed up diffusion process with faster scheduler and memory optimization - >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) - >>> # remove following line if xformers is not installed - >>> pipe.enable_xformers_memory_efficient_attention() - >>> # generate image - >>> generator = paddle.Generator().manual_seed(0) - >>> image = pipe( - ... "futuristic-looking woman", num_inference_steps=20, generator=generator, image=canny_image - ... ).images[0] - ``` -""" - - -class MultiControlNetModel(ModelMixin): - r""" - Multiple `ControlNetModel` wrapper class for Multi-ControlNet - This module is a wrapper for multiple instances of the `ControlNetModel`. The `forward()` API is designed to be - compatible with `ControlNetModel`. - Args: - controlnets (`List[ControlNetModel]`): - Provides additional conditioning to the unet during the denoising process. You must set multiple - `ControlNetModel` as a list. 
- """ - - def __init__(self, controlnets: Union[List[ControlNetModel], Tuple[ControlNetModel]]): - super().__init__() - self.nets = nn.LayerList(controlnets) - - def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - encoder_hidden_states: paddle.Tensor, - controlnet_cond: List[paddle.Tensor], - conditioning_scale: Union[List[List[float]], List[float]], - class_labels: Optional[paddle.Tensor] = None, - timestep_cond: Optional[paddle.Tensor] = None, - attention_mask: Optional[paddle.Tensor] = None, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - return_dict: bool = True, - ) -> Union[ControlNetOutput, Tuple]: - for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)): - down_samples, mid_sample = controlnet( - sample, - timestep, - encoder_hidden_states, - image, - scale, - class_labels, - timestep_cond, - attention_mask, - cross_attention_kwargs, - return_dict, - ) - - # merge samples - if i == 0: - down_block_res_samples, mid_block_res_sample = down_samples, mid_sample - else: - down_block_res_samples = [ - samples_prev + samples_curr - for samples_prev, samples_curr in zip(down_block_res_samples, down_samples) - ] - mid_block_res_sample += mid_sample - - return down_block_res_samples, mid_block_res_sample - - -class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMixin): - r""" - Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance. - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - In addition the pipeline inherits the following loading methods: - - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - controlnet ([`ControlNetModel`] or `List[ControlNetModel]`): - Provides additional conditioning to the unet during the denoising process. If you set multiple ControlNets - as a list, the outputs from each ControlNet are added together to create one combined additional - conditioning. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
- """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel], - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = True, - ): - super().__init__() - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - if isinstance(controlnet, (list, tuple)): - controlnet = MultiControlNetModel(controlnet) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - controlnet=controlnet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. - When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several - steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. - When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in - several steps. This is useful to save a large amount of memory and to allow the processing of larger images. - """ - self.vae.enable_tiling() - - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. 
- Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - config = ( - self.text_encoder.config - if isinstance(self.text_encoder.config, dict) - else self.text_encoder.config.to_dict() - ) - if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." 
- ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - config = ( - self.text_encoder.config - if isinstance(self.text_encoder.config, dict) - else self.text_encoder.config.to_dict() - ) - if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - image, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - controlnet_conditioning_scale=1.0, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - # `prompt` needs more sophisticated handling when there are multiple - # conditionings. - if isinstance(self.controlnet, MultiControlNetModel): - if isinstance(prompt, list): - logger.warning( - f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}" - " prompts. The conditionings will be fixed across the prompts." - ) - - # Check `image` - if isinstance(self.controlnet, ControlNetModel): - self.check_image(image, prompt, prompt_embeds) - elif isinstance(self.controlnet, MultiControlNetModel): - if not isinstance(image, list): - raise TypeError("For multiple controlnets: `image` must be type `list`") - - # When `image` is a nested list: - # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) - elif any(isinstance(i, list) for i in image): - raise ValueError("A single batch of multiple conditionings are supported at the moment.") - elif len(image) != len(self.controlnet.nets): - raise ValueError( - "For multiple controlnets: `image` must have the same length as the number of controlnets." 
- ) - - for image_ in image: - self.check_image(image_, prompt, prompt_embeds) - else: - assert False - - # Check `controlnet_conditioning_scale` - if isinstance(self.controlnet, ControlNetModel): - if not isinstance(controlnet_conditioning_scale, float): - raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") - elif isinstance(self.controlnet, MultiControlNetModel): - if isinstance(controlnet_conditioning_scale, list): - if any(isinstance(i, list) for i in controlnet_conditioning_scale): - raise ValueError("A single batch of multiple conditionings are supported at the moment.") - elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( - self.controlnet.nets - ): - raise ValueError( - "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" - " the same length as the number of controlnets" - ) - else: - assert False - - def check_image(self, image, prompt, prompt_embeds): - image_is_pil = isinstance(image, PIL.Image.Image) - image_is_tensor = isinstance(image, paddle.Tensor) - image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) - image_is_tensor_list = isinstance(image, list) and isinstance(image[0], paddle.Tensor) - - if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list: - raise TypeError( - "image must be one of PIL image, paddle tensor, list of PIL images, or list of paddle tensors" - ) - - if image_is_pil: - image_batch_size = 1 - elif image_is_tensor: - image_batch_size = image.shape[0] - elif image_is_pil_list: - image_batch_size = len(image) - elif image_is_tensor_list: - image_batch_size = len(image) - - if prompt is not None and isinstance(prompt, str): - prompt_batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - prompt_batch_size = len(prompt) - elif prompt_embeds is not None: - prompt_batch_size = prompt_embeds.shape[0] - - if image_batch_size != 1 and image_batch_size != prompt_batch_size: - raise ValueError( - f"If image batch size is not 1, image batch size must be same as prompt batch size. 
image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" - ) - - def prepare_image( - self, - image, - width, - height, - batch_size, - num_images_per_prompt, - dtype, - do_classifier_free_guidance=False, - guess_mode=False, - ): - if not isinstance(image, paddle.Tensor): - if isinstance(image, PIL.Image.Image): - image = [image] - - if isinstance(image[0], PIL.Image.Image): - images = [] - for image_ in image: - image_ = image_.convert("RGB") - image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]) - image_ = np.array(image_) - image_ = image_[None, :] - images.append(image_) - - image = np.concatenate(images, axis=0) - image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = paddle.to_tensor(image) - elif isinstance(image[0], paddle.Tensor): - image = paddle.concat(image, axis=0) - - image_batch_size = image.shape[0] - - if image_batch_size == 1: - repeat_by = batch_size - else: - # image batch size is the same as prompt batch size - repeat_by = num_images_per_prompt - - image = image.repeat_interleave(repeat_by, axis=0) - - image = image.cast(dtype) - if do_classifier_free_guidance and not guess_mode: - image = paddle.concat([image] * 2) - - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = [batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor] - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - def _default_height_width(self, height, width, image): - while isinstance(image, list): - image = image[0] - - if height is None: - if isinstance(image, PIL.Image.Image): - height = image.height - elif isinstance(image, paddle.Tensor): - height = image.shape[2] - - height = (height // 8) * 8 # round down to nearest multiple of 8 - - if width is None: - if isinstance(image, PIL.Image.Image): - width = image.width - elif isinstance(image, paddle.Tensor): - width = image.shape[3] - - width = (width // 8) * 8 # round down to nearest multiple of 8 - - return height, width - - # override DiffusionPipeline - def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - safe_serialization: bool = False, - variant: Optional[str] = None, - to_diffusers: bool = None, - ): - if isinstance(self.controlnet, ControlNetModel): - super().save_pretrained( - save_directory, safe_serialization=safe_serialization, variant=variant, to_diffusers=to_diffusers - ) - else: - raise NotImplementedError("Currently, the `save_pretrained()` is not implemented for Multi-ControlNet.") - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: Union[paddle.Tensor, PIL.Image.Image, List[paddle.Tensor], List[PIL.Image.Image]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - controlnet_conditioning_scale: Union[float, List[float]] = 1.0, - guess_mode: bool = False, - ): - r""" - Function invoked when calling the pipeline for generation. - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`paddle.Tensor`, `PIL.Image.Image`, `List[paddle.Tensor]`, `List[PIL.Image.Image]`, - `List[List[paddle.Tensor]]`, or `List[List[PIL.Image.Image]]`): - The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `paddle.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can - also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If - height and/or width are passed, `image` is resized according to them. If multiple ControlNets are - specified in init, images must be passed as a list such that each element of the list can be correctly - batched for input to a single controlnet. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. 
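A worked example of the rounding in `_default_height_width` above, assuming the dimensions come from a conditioning image of a hypothetical size: both sides are snapped down to the nearest multiple of 8 so they stay compatible with the /8 constraint enforced in `check_inputs`.

height, width = 517, 770      # hypothetical conditioning-image size
height = (height // 8) * 8    # 512
width = (width // 8) * 8      # 768
print(height, width)          # 512 768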
- width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/ppdiffusers/models/cross_attention.py). 
- controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): - The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added - to the residual in the original unet. If multiple ControlNets are specified in init, you can set the - corresponding scale as a list. - guess_mode (`bool`, *optional*, defaults to `False`): - In this mode, the ControlNet encoder will try best to recognize the content of the input image even if - you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended. - Examples: - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Default height and width to unet - height, width = self._default_height_width(height, width, image) - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - image, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - controlnet_conditioning_scale, - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - if isinstance(self.controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): - controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(self.controlnet.nets) - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Prepare image - if isinstance(self.controlnet, ControlNetModel): - image = self.prepare_image( - image=image, - width=width, - height=height, - batch_size=batch_size * num_images_per_prompt, - num_images_per_prompt=num_images_per_prompt, - dtype=self.controlnet.dtype, - do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, - ) - elif isinstance(self.controlnet, MultiControlNetModel): - images = [] - - for image_ in image: - image_ = self.prepare_image( - image=image_, - width=width, - height=height, - batch_size=batch_size * num_images_per_prompt, - num_images_per_prompt=num_images_per_prompt, - dtype=self.controlnet.dtype, - do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, - ) - - images.append(image_) - - image = images - else: - assert False - - # 5. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 6. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # 7. 
Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 8. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # controlnet(s) inference - if guess_mode and do_classifier_free_guidance: - # Infer ControlNet only for the conditional batch. - controlnet_latent_model_input = latents - controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] - else: - controlnet_latent_model_input = latent_model_input - controlnet_prompt_embeds = prompt_embeds - - # controlnet(s) inference - down_block_res_samples, mid_block_res_sample = self.controlnet( - controlnet_latent_model_input, - t, - encoder_hidden_states=controlnet_prompt_embeds, - controlnet_cond=image, - conditioning_scale=controlnet_conditioning_scale, - guess_mode=guess_mode, - return_dict=False, - ) - - if guess_mode and do_classifier_free_guidance: - # Infered ControlNet only for the conditional batch. - # To apply the output of ControlNet to both the unconditional and conditional batches, - # add 0 to the unconditional batch to keep it unchanged. - down_block_res_samples = [paddle.concat([paddle.zeros_like(d), d]) for d in down_block_res_samples] - mid_block_res_sample = paddle.concat( - [paddle.zeros_like(mid_block_res_sample), mid_block_res_sample] - ) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - down_block_additional_residuals=down_block_res_samples, - mid_block_additional_residual=mid_block_res_sample, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - image = latents - has_nsfw_concept = None - elif output_type == "pil": - # 8. Post-processing - image = self.decode_latents(latents) - - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - - # 10. Convert to PIL - image = self.numpy_to_pil(image) - else: - # 8. Post-processing - image = self.decode_latents(latents) - - # 9. 
Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py deleted file mode 100644 index cbd2ae8f6eee..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ /dev/null @@ -1,664 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Callable, List, Optional, Union - -import numpy as np -import paddle -import PIL -from packaging import version - -from paddlenlp.transformers import ( - CLIPTextModel, - CLIPTokenizer, - DPTForDepthEstimation, - DPTImageProcessor, -) - -from ...configuration_utils import FrozenDict -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess -def preprocess(image): - if isinstance(image, paddle.Tensor): - return image - elif isinstance(image, PIL.Image.Image): - image = [image] - - if isinstance(image[0], PIL.Image.Image): - w, h = image[0].size - w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - - image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] - image = np.concatenate(image, axis=0) - image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = 2.0 * image - 1.0 - image = paddle.to_tensor(image) - elif isinstance(image[0], paddle.Tensor): - image = paddle.concat(image, axis=0) - return image - - -class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): - r""" - Pipeline for text-guided image to image generation using Stable Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
- - In addition the pipeline inherits the following loading methods: - - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] - as well as the following saving methods: - - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - """ - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - depth_estimator: DPTForDepthEstimation, - feature_extractor: DPTImageProcessor, - ): - super().__init__() - - is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version).base_version - ) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 - if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: - deprecation_message = ( - "The configuration file of the unet has set the default `sample_size` to smaller than" - " 64 which seems highly unlikely .If your checkpoint is a fine-tuned version of any of the" - " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" - " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" - " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" - " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" - " in the config might lead to incorrect results in future versions. 
If you have downloaded this" - " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file" - ) - deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(unet.config) - new_config["sample_size"] = 64 - unet._internal_dict = FrozenDict(new_config) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - depth_estimator=depth_estimator, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. 
- """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs - def check_inputs( - self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None - ): - if strength < 0 or strength > 1: - raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." 
- ) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps - def get_timesteps(self, num_inference_steps, strength): - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] - - return timesteps, num_inference_steps - t_start - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.prepare_latents - def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): - if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)): - raise ValueError( - f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or list but is {type(image)}" - ) - - image = image.cast(dtype) - - batch_size = batch_size * num_images_per_prompt - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if isinstance(generator, list): - init_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) - ] - init_latents = paddle.concat(init_latents, axis=0) - else: - init_latents = self.vae.encode(image).latent_dist.sample(generator) - - init_latents = self.vae.config.scaling_factor * init_latents - - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: - # expand init_latents for batch_size - deprecation_message = ( - f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" - " images (`image`). Initial images are now duplicated to match the number of text prompts. Note" - " that this behavior is deprecated and will be removed in version 1.0.0. Please make sure to update" - " your script to pass as many initial images as text prompts to suppress this warning." - ) - deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) - additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = paddle.concat([init_latents] * additional_image_per_prompt, axis=0) - elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." - ) - else: - init_latents = paddle.concat([init_latents], axis=0) - - shape = init_latents.shape - noise = randn_tensor(shape, generator=generator, dtype=dtype) - - # get latents - init_latents = self.scheduler.add_noise(init_latents, noise, timestep) - latents = init_latents - - return latents - - def prepare_depth_map(self, image, depth_map, batch_size, do_classifier_free_guidance, dtype): - if isinstance(image, PIL.Image.Image): - image = [image] - else: - image = list(image) - - if isinstance(image[0], PIL.Image.Image): - width, height = image[0].size - else: - height, width = image[0].shape[-2:] - - if depth_map is None: - pixel_values = self.feature_extractor(images=image, return_tensors="pd").pixel_values - # The DPT-Hybrid model uses batch-norm layers which are not compatible with fp16.
- # TODO: DPTModel `expand_as` does not support float16 - with paddle.amp.auto_cast(True, level="O2"): - depth_map = self.depth_estimator(pixel_values).predicted_depth.cast("float32") - else: - depth_map = depth_map.cast("float32") - - depth_map = paddle.nn.functional.interpolate( - depth_map.unsqueeze(1), - size=(height // self.vae_scale_factor, width // self.vae_scale_factor), - mode="bicubic", - align_corners=False, - ) - # amin / amax do not support float16 - depth_min = paddle.amin(depth_map, axis=[1, 2, 3], keepdim=True) - depth_max = paddle.amax(depth_map, axis=[1, 2, 3], keepdim=True) - depth_map = 2.0 * (depth_map - depth_min) / (depth_max - depth_min) - 1.0 - # maybe cast to float16 - depth_map = depth_map.cast(dtype) - - # duplicate depth_map for each generation per prompt, using mps friendly method - if depth_map.shape[0] < batch_size: - repeat_by = batch_size // depth_map.shape[0] - depth_map = depth_map.tile([repeat_by, 1, 1, 1]) - - depth_map = paddle.concat([depth_map] * 2) if do_classifier_free_guidance else depth_map - return depth_map - - @paddle.no_grad() - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: Union[paddle.Tensor, PIL.Image.Image] = None, - depth_map: Optional[paddle.Tensor] = None, - strength: float = 0.8, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds` - instead. - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. - strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` - will be used as a starting point, adding more noise to it the larger the `strength`. The number of - denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will - be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. This parameter will be modulated by `strength`. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2 of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality.
- negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Examples: - - ```py - >>> import paddle - >>> import requests - >>> from PIL import Image - - >>> from ppdiffusers import StableDiffusionDepth2ImgPipeline - - >>> pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - ... "stabilityai/stable-diffusion-2-depth", - ... paddle_dtype=paddle.float16, - ... ) - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> init_image = Image.open(requests.get(url, stream=True).raw) - >>> prompt = "two tigers" - >>> n_prompt = "bad, deformed, ugly, bad anatomy" - >>> image = pipe(prompt=prompt, image=init_image, negative_prompt=n_prompt, strength=0.7).images[0] - ``` - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 1. Check inputs - self.check_inputs( - prompt, - strength, - callback_steps, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - if image is None: - raise ValueError("`image` input cannot be undefined.") - - # 2.
Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Prepare depth mask - depth_mask = self.prepare_depth_map( - image, - depth_map, - batch_size * num_images_per_prompt, - do_classifier_free_guidance, - prompt_embeds.dtype, - ) - - # 5. Preprocess image - image = preprocess(image) - - # 6. Set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) - - # 7. Prepare latent variables - latents = self.prepare_latents( - image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, generator - ) - - # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 9. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - latent_model_input = paddle.concat([latent_model_input, depth_mask], axis=1) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - latents = latents.cast(prompt_embeds.dtype) - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 10. Post-processing - image = self.decode_latents(latents) - - # 11. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,) - - return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py deleted file mode 100644 index fab7aad3e896..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +++ /dev/null @@ -1,374 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
-# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Callable, List, Optional, Union - -import paddle -import PIL -from packaging import version - -from paddlenlp.transformers import CLIPImageProcessor, CLIPVisionModelWithProjection - -from ...configuration_utils import FrozenDict -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, logging, randn_tensor -from ..pipeline_utils import DiffusionPipeline -from . import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class StableDiffusionImageVariationPipeline(DiffusionPipeline): - r""" - Pipeline to generate variations from an input image using Stable Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - image_encoder ([`CLIPVisionModelWithProjection`]): - Frozen CLIP image-encoder. Stable Diffusion Image Variation uses the vision portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModelWithProjection), - specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - # TODO: feature_extractor is required to encode images (if they are in PIL format), - # we should give a descriptive message if the pipeline doesn't have one. 
- _optional_components = ["safety_checker"] - - def __init__( - self, - vae: AutoencoderKL, - image_encoder: CLIPVisionModelWithProjection, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = True, - ): - super().__init__() - - if safety_checker is None and requires_safety_checker: - logger.warn( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version).base_version - ) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 - if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: - deprecation_message = ( - "The configuration file of the unet has set the default `sample_size` to smaller than" - " 64 which seems highly unlikely .If your checkpoint is a fine-tuned version of any of the" - " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" - " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" - " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" - " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" - " in the config might lead to incorrect results in future versions. 
If you have downloaded this" - " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file" - ) - deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(unet.config) - new_config["sample_size"] = 64 - unet._internal_dict = FrozenDict(new_config) - - self.register_modules( - vae=vae, - image_encoder=image_encoder, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - def _encode_image(self, image, num_images_per_prompt, do_classifier_free_guidance): - dtype = self.image_encoder.dtype - - if not isinstance(image, paddle.Tensor): - image = self.feature_extractor(images=image, return_tensors="pd").pixel_values - - image = image.cast(dtype) - image_embeddings = self.image_encoder(image).image_embeds - image_embeddings = image_embeddings.unsqueeze(1) - - # duplicate image embeddings for each generation per prompt, using mps friendly method - bs_embed, seq_len, _ = image_embeddings.shape - image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1]) - image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - if do_classifier_free_guidance: - negative_prompt_embeds = paddle.zeros_like(image_embeddings) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings]) - - return image_embeddings - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs(self, image, height, width, callback_steps): - if ( - not isinstance(image, paddle.Tensor) - and not isinstance(image, PIL.Image.Image) - and not isinstance(image, list) - ): - raise ValueError( - "`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" - f" {type(image)}" - ) - - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @paddle.no_grad() - def __call__( - self, - image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor], - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `paddle.Tensor`): - The image or images to guide the image generation. If you provide a tensor, it needs to comply with the - configuration of - [this](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json) - `CLIPImageProcessor` - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs(image, height, width, callback_steps) - - # 2. Define call parameters - if isinstance(image, PIL.Image.Image): - batch_size = 1 - elif isinstance(image, list): - batch_size = len(image) - else: - batch_size = image.shape[0] - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input image - image_embeddings = self._encode_image(image, num_images_per_prompt, do_classifier_free_guidance) - - # 4. 
Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - image_embeddings.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 8. Post-processing - image = self.decode_latents(latents) - - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, image_embeddings.dtype) - - # 10. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py deleted file mode 100644 index 842e7d6bba8c..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ /dev/null @@ -1,678 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
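Note: the `StableDiffusionImageVariationPipeline` deleted above carries no usage example in its docstring, unlike the img2img pipeline whose removal begins here. For context, a minimal sketch of the call pattern it supported, assuming the pre-removal ppdiffusers package exports the class at the top level and using the `lambdalabs/sd-image-variations-diffusers` checkpoint referenced in its docstring (the input path is a placeholder):

```py
import paddle
from PIL import Image

from ppdiffusers import StableDiffusionImageVariationPipeline  # assumes the pre-removal package layout

# Checkpoint id mirrors the one cited in the deleted class docstring; it is an assumption here.
pipe = StableDiffusionImageVariationPipeline.from_pretrained(
    "lambdalabs/sd-image-variations-diffusers", paddle_dtype=paddle.float16
)

# Any RGB image serves as conditioning; "input.jpg" is an illustrative path.
init_image = Image.open("input.jpg").convert("RGB")

# The image is encoded by the CLIP vision tower and fresh latents are denoised
# against that embedding; no text prompt is involved.
images = pipe(image=init_image, guidance_scale=7.5, num_inference_steps=50).images
images[0].save("variation.png")
```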
- -import inspect -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import paddle -import PIL -from packaging import version - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer - -from ...configuration_utils import FrozenDict -from ...image_processor import VaeImageProcessor -from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import ( - PIL_INTERPOLATION, - deprecate, - logging, - randn_tensor, - replace_example_docstring, -) -from ..pipeline_utils import DiffusionPipeline -from . import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import requests - >>> import paddle - >>> from PIL import Image - >>> from io import BytesIO - - >>> from ppdiffusers import StableDiffusionImg2ImgPipeline - - >>> model_id_or_path = "runwayml/stable-diffusion-v1-5" - >>> pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, paddle_dtype=paddle.float16) - - >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" - - >>> response = requests.get(url) - >>> init_image = Image.open(BytesIO(response.content)).convert("RGB") - >>> init_image = init_image.resize((768, 512)) - - >>> prompt = "A fantasy landscape, trending on artstation" - - >>> images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images - >>> images[0].save("fantasy_landscape.png") - ``` -""" - - -def preprocess(image): - if isinstance(image, paddle.Tensor): - return image - elif isinstance(image, PIL.Image.Image): - image = [image] - - if isinstance(image[0], PIL.Image.Image): - w, h = image[0].size - w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - - image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] - image = np.concatenate(image, axis=0) - image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = 2.0 * image - 1.0 - image = paddle.to_tensor(image) - elif isinstance(image[0], paddle.Tensor): - image = paddle.concat(image, axis=0) - return image - - -class StableDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): - r""" - Pipeline for text-guided image to image generation using Stable Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - In addition the pipeline inherits the following loading methods: - - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] - - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] - as well as the following saving methods: - - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. 
Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`PNDMScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] - or [`DPMSolverMultistepScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - _optional_components = ["safety_checker", "feature_extractor"] - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__ - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = True, - ): - super().__init__() - - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " - "to update the config accordingly as leaving `steps_offset` might led to incorrect results" - " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," - " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file" - ) - deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." - " `clip_sample` should be set to False in the configuration file. Please make sure to update the" - " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" - " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" - " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" - ) - deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["clip_sample"] = False - scheduler._internal_dict = FrozenDict(new_config) - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. 
Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version).base_version - ) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 - if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: - deprecation_message = ( - "The configuration file of the unet has set the default `sample_size` to smaller than" - " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" - " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" - " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" - " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" - " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" - " in the config might lead to incorrect results in future versions. If you have downloaded this" - " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file" - ) - deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(unet.config) - new_config["sample_size"] = 64 - unet._internal_dict = FrozenDict(new_config) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) - self.register_to_config( - requires_safety_checker=requires_safety_checker, - ) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. 
Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - def run_safety_checker(self, image, dtype): - if self.safety_checker is None: - has_nsfw_concept = None - else: - if paddle.is_tensor(image): - feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") - else: - feature_extractor_input = self.image_processor.numpy_to_pil(image) - safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=paddle.cast(safety_checker_input.pixel_values, dtype) - ) - return image, has_nsfw_concept - - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - # image = (image / 2 + 0.5).clip(0, 1) - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None - ): - if strength < 0 or strength > 1: - raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - def get_timesteps(self, num_inference_steps, strength): - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] - - return timesteps, num_inference_steps - t_start - - def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): - if not isinstance(image, (paddle.Tensor, list)): - raise ValueError(f"`image` has to be of type `paddle.Tensor` or list but is {type(image)}") - - image = image.cast(dtype) - - batch_size = batch_size * num_images_per_prompt - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - if isinstance(generator, list): - init_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) - ] - init_latents = paddle.concat(init_latents, axis=0) - else: - init_latents = self.vae.encode(image).latent_dist.sample(generator) - - init_latents = self.vae.config.scaling_factor * init_latents - - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: - # expand init_latents for batch_size - deprecation_message = ( - f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" - " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" - " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many initial images as text prompts to suppress this warning." - ) - deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) - additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = paddle.concat([init_latents] * additional_image_per_prompt, axis=0) - elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." - ) - else: - init_latents = paddle.concat([init_latents], axis=0) - - shape = init_latents.shape - noise = randn_tensor(shape, generator=generator, dtype=dtype) - - # get latents - init_latents = self.scheduler.add_noise(init_latents, noise, timestep) - latents = init_latents - - return latents - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: Union[paddle.Tensor, PIL.Image.Image] = None, - strength: float = 0.8, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. - strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` - will be used as a starting point, adding more noise to it the larger the `strength`. The number of - denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will - be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the - expense of slower inference. This parameter will be modulated by `strength`. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 1. Check inputs. 
Raise error if not correct - self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Preprocess image - image = self.image_processor.preprocess(image) - - # 5. set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) - - # 6. Prepare latent variables - latents = self.prepare_latents( - image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, generator - ) - - # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 8. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if not output_type == "latent": - image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - else: - image = latents - has_nsfw_concept = None - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py 
b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py deleted file mode 100644 index ea5c1f50c0b3..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ /dev/null @@ -1,803 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Callable, List, Optional, Union - -import numpy as np -import paddle -import paddle.nn.functional as F -import PIL -from packaging import version - -from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer - -from ...configuration_utils import FrozenDict -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, logging, randn_tensor -from ..pipeline_utils import DiffusionPipeline -from . import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -def prepare_mask_and_masked_image(image, mask): - """ - Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be - converted to ``paddle.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the - ``image`` and ``1`` for the ``mask``. - - The ``image`` will be converted to ``paddle.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be - binarized (``mask > 0.5``) and cast to ``paddle.float32`` too. - - Args: - image (Union[np.array, PIL.Image, paddle.Tensor]): The image to inpaint. - It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width`` - ``paddle.Tensor`` or a ``batch x channels x height x width`` ``paddle.Tensor``. - mask (_type_): The mask to apply to the image, i.e. regions to inpaint. - It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width`` - ``paddle.Tensor`` or a ``batch x 1 x height x width`` ``paddle.Tensor``. - - - Raises: - ValueError: ``paddle.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``paddle.Tensor`` mask - should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions. - TypeError: ``mask`` is a ``paddle.Tensor`` but ``image`` is not - (ot the other way around). - - Returns: - tuple[paddle.Tensor]: The pair (mask, masked_image) as ``paddle.Tensor`` with 4 - dimensions: ``batch x channels x height x width``. 
- """ - if isinstance(image, paddle.Tensor): - if not isinstance(mask, paddle.Tensor): - raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not") - - # Batch single image - if image.ndim == 3: - assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" - image = image.unsqueeze(0) - - # Batch and add channel dim for single mask - if mask.ndim == 2: - mask = mask.unsqueeze(0).unsqueeze(0) - - # Batch single mask or add channel dim - if mask.ndim == 3: - # Single batched mask, no channel dim or single mask not batched but channel dim - if mask.shape[0] == 1: - mask = mask.unsqueeze(0) - - # Batched masks no channel dim - else: - mask = mask.unsqueeze(1) - - assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" - assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" - assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" - - # Check image is in [-1, 1] - if image.min() < -1 or image.max() > 1: - raise ValueError("Image should be in [-1, 1] range") - - # Check mask is in [0, 1] - if mask.min() < 0 or mask.max() > 1: - raise ValueError("Mask should be in [0, 1] range") - - # Binarize mask - mask = paddle.where(mask < 0.5, 0.0, 1.0) - - # Image as float32 - image = image.cast(paddle.float32) - elif isinstance(mask, paddle.Tensor): - raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") - else: - # preprocess image - if isinstance(image, (PIL.Image.Image, np.ndarray)): - image = [image] - - if isinstance(image, list) and isinstance(image[0], PIL.Image.Image): - image = [np.array(i.convert("RGB"))[None, :] for i in image] - image = np.concatenate(image, axis=0) - elif isinstance(image, list) and isinstance(image[0], np.ndarray): - image = np.concatenate([i[None, :] for i in image], axis=0) - - image = image.transpose(0, 3, 1, 2) - image = paddle.to_tensor(image).cast(paddle.float32) / 127.5 - 1.0 - - # preprocess mask - if isinstance(mask, (PIL.Image.Image, np.ndarray)): - mask = [mask] - - if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image): - mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) - mask = mask.astype(np.float32) / 255.0 - elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): - mask = np.concatenate([m[None, None, :] for m in mask], axis=0) - - mask[mask < 0.5] = 0 - mask[mask >= 0.5] = 1 - mask = paddle.to_tensor(mask) - - masked_image = image * (mask < 0.5) - - return mask, masked_image - - -class StableDiffusionInpaintPipeline(DiffusionPipeline): - r""" - Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. 
- tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPFeatureExtractor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - requires_safety_checker: bool = True, - ): - super().__init__() - - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " - "to update the config accordingly as leaving `steps_offset` might led to incorrect results" - " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," - " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file" - ) - deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if hasattr(scheduler.config, "skip_prk_steps") and scheduler.config.skip_prk_steps is False: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} has not set the configuration" - " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make" - " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to" - " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face" - " Hub, it would be very nice if you could open a Pull request for the" - " `scheduler/scheduler_config.json` file" - ) - deprecate("skip_prk_steps not set", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["skip_prk_steps"] = True - scheduler._internal_dict = FrozenDict(new_config) - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. 
For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version).base_version - ) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 - if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: - deprecation_message = ( - "The configuration file of the unet has set the default `sample_size` to smaller than" - " 64 which seems highly unlikely .If your checkpoint is a fine-tuned version of any of the" - " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" - " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" - " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" - " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" - " in the config might lead to incorrect results in future versions. If you have downloaded this" - " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file" - ) - deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(unet.config) - new_config["sample_size"] = 64 - unet._internal_dict = FrozenDict(new_config) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. 
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = [batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor] - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - def prepare_mask_latents( - self, mask, masked_image, batch_size, height, width, dtype, generator, do_classifier_free_guidance - ): - # resize the mask to latents shape as we concatenate the mask to the latents - # we do that before converting to dtype to avoid breaking in case we're using cpu_offload - # and half precision - mask = F.interpolate(mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)) - mask = mask.cast(dtype) - - masked_image = masked_image.cast(dtype) - - # encode the mask image into latents space so we can concatenate it to the latents - if isinstance(generator, list): - masked_image_latents = [ - self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i]) - for i in range(batch_size) - ] - masked_image_latents = paddle.concat(masked_image_latents, axis=0) - else: - masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator) - masked_image_latents = self.vae.config.scaling_factor * masked_image_latents - - # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method - if mask.shape[0] < batch_size: - if not batch_size % mask.shape[0] == 0: - raise ValueError( - "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to" - f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number" - " of masks that you pass is divisible by the total requested batch size." - ) - mask = mask.tile([batch_size // mask.shape[0], 1, 1, 1]) - if masked_image_latents.shape[0] < batch_size: - if not batch_size % masked_image_latents.shape[0] == 0: - raise ValueError( - "The passed images and the required batch size don't match. Images are supposed to be duplicated" - f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." - " Make sure the number of images that you pass is divisible by the total requested batch size." 
- ) - masked_image_latents = masked_image_latents.tile([batch_size // masked_image_latents.shape[0], 1, 1, 1]) - - mask = paddle.concat([mask] * 2) if do_classifier_free_guidance else mask - masked_image_latents = ( - paddle.concat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents - ) - - # aligning device to prevent device errors when concating it with the latent model input - masked_image_latents = masked_image_latents.cast(dtype) - return mask, masked_image_latents - - @paddle.no_grad() - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: Union[paddle.Tensor, PIL.Image.Image] = None, - mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`PIL.Image.Image`): - `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will - be masked out with `mask_image` and repainted according to `prompt`. - mask_image (`PIL.Image.Image`): - `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be - repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted - to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) - instead of 3, so the expected shape would be `(B, H, W, 1)`. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. 
- eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Examples: - - ```py - >>> import PIL - >>> import requests - >>> import paddle - >>> from io import BytesIO - - >>> from ppdiffusers import StableDiffusionInpaintPipeline - - - >>> def download_image(url): - ... response = requests.get(url) - ... return PIL.Image.open(BytesIO(response.content)).convert("RGB") - - - >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" - >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" - - >>> init_image = download_image(img_url).resize((512, 512)) - >>> mask_image = download_image(mask_url).resize((512, 512)) - - >>> pipe = StableDiffusionInpaintPipeline.from_pretrained( - ... "runwayml/stable-diffusion-inpainting", paddle_dtype=paddle.float16 - ... ) - - >>> prompt = "Face of a yellow cat, high resolution, sitting on a park bench" - >>> image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0] - ``` - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. 
Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs - self.check_inputs( - prompt, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - ) - - if image is None: - raise ValueError("`image` input cannot be undefined.") - - if mask_image is None: - raise ValueError("`mask_image` input cannot be undefined.") - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Preprocess mask and image - mask, masked_image = prepare_mask_and_masked_image(image, mask_image) - - # 5. set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 6. Prepare latent variables - num_channels_latents = self.vae.config.latent_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # 7. Prepare mask latent variables - mask, masked_image_latents = self.prepare_mask_latents( - mask, - masked_image, - batch_size * num_images_per_prompt, - height, - width, - prompt_embeds.dtype, - generator, - do_classifier_free_guidance, - ) - - # 8. Check that sizes of mask, masked image and latents match - num_channels_mask = mask.shape[1] - num_channels_masked_image = masked_image_latents.shape[1] - if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: - raise ValueError( - f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" - f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" - f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" - f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" - " `pipeline.unet` or your `mask_image` or `image` input." - ) - - # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 10. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - - # concat latents, mask, masked_image_latents in the channel dimension - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - latent_model_input = paddle.concat([latent_model_input, mask, masked_image_latents], axis=1) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # must cast dtype, paddle.concat has bug.... - latents = latents.cast(prompt_embeds.dtype) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 11. Post-processing - image = self.decode_latents(latents) - - # 12. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - - # 13. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py deleted file mode 100644 index a79cca0a8f2e..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ /dev/null @@ -1,650 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
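For reference, the shape bookkeeping behind the channel check and the `paddle.concat` in the inpaint denoising loop above can be sketched as follows. This is a minimal illustrative sketch, not code from this repository; the concrete shapes assume a 512x512 image, `vae_scale_factor == 8`, and an inpaint UNet with `unet.config.in_channels == 9`.

```py
import paddle

# Assumed shapes: a batch of 2 latents at 64x64 (512 // vae_scale_factor).
latents = paddle.randn([2, 4, 64, 64])               # num_channels_latents == 4
mask = paddle.ones([2, 1, 64, 64])                   # num_channels_mask == 1
masked_image_latents = paddle.randn([2, 4, 64, 64])  # num_channels_masked_image == 4

# The inpaint UNet consumes all three inputs concatenated along the channel axis.
latent_model_input = paddle.concat([latents, mask, masked_image_latents], axis=1)
assert latent_model_input.shape[1] == 4 + 1 + 4  # must equal unet.config.in_channels
```

Under classifier-free guidance each of the three tensors is first duplicated along the batch dimension (`paddle.concat([latents] * 2)` and so on), so the channel count stays at 9 while the batch dimension doubles.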
- -import inspect -from typing import Callable, List, Optional, Union - -import numpy as np -import paddle -import paddle.nn.functional as F -import PIL -from packaging import version - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer - -from ...configuration_utils import FrozenDict -from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor -from ..pipeline_utils import DiffusionPipeline -from . import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker - -logger = logging.get_logger(__name__) - - -def preprocess_image(image, batch_size): - w, h = image.size - w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) - image = np.array(image).astype(np.float32) / 255.0 - image = np.vstack([image[None].transpose(0, 3, 1, 2)] * batch_size) - image = paddle.to_tensor(image) - return 2.0 * image - 1.0 - - -def preprocess_mask(mask, batch_size, scale_factor=8): - if not isinstance(mask, paddle.Tensor): - mask = mask.convert("L") - w, h = mask.size - w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"]) - mask = np.array(mask).astype(np.float32) / 255.0 - mask = np.tile(mask, (4, 1, 1)) - mask = np.vstack([mask[None]] * batch_size) - mask = 1 - mask # repaint white, keep black - mask = paddle.to_tensor(mask) - return mask - - else: - valid_mask_channel_sizes = [1, 3] - # if mask channel is fourth tensor dimension, permute dimensions to pytorch standard (B, C, H, W) - if mask.shape[3] in valid_mask_channel_sizes: - mask = mask.transpose([0, 3, 1, 2]) - elif mask.shape[1] not in valid_mask_channel_sizes: - raise ValueError( - f"Mask channel dimension of size in {valid_mask_channel_sizes} should be second or fourth dimension," - f" but received mask of shape {tuple(mask.shape)}" - ) - # (potentially) reduce mask channel dimension from 3 to 1 for broadcasting to latent shape - mask = mask.mean(1, keepdim=True) - h, w = mask.shape[-2:] - h, w = (x - x % 8 for x in (h, w)) # resize to integer multiple of 8 - mask = F.interpolate(mask, (h // scale_factor, w // scale_factor)) - return mask - - -class StableDiffusionInpaintPipelineLegacy( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin -): - r""" - Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - In addition the pipeline inherits the following loading methods: - - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] - - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] - as well as the following saving methods: - - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. 
Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - _optional_components = ["feature_extractor"] - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__ - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = True, - ): - super().__init__() - - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " - "to update the config accordingly as leaving `steps_offset` might led to incorrect results" - " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," - " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file" - ) - deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." - " `clip_sample` should be set to False in the configuration file. Please make sure to update the" - " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" - " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" - " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" - ) - deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["clip_sample"] = False - scheduler._internal_dict = FrozenDict(new_config) - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. 
Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version).base_version - ) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 - if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: - deprecation_message = ( - "The configuration file of the unet has set the default `sample_size` to smaller than" - " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" - " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" - " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" - " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" - " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" - " in the config might lead to incorrect results in future versions. If you have downloaded this" - " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file" - ) - deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(unet.config) - new_config["sample_size"] = 64 - unet._internal_dict = FrozenDict(new_config) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). 
- prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs - def check_inputs( - self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None - ): - if strength < 0 or strength > 1: - raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}."
- ) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps - def get_timesteps(self, num_inference_steps, strength): - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] - - return timesteps, num_inference_steps - t_start - - def prepare_latents(self, image, timestep, num_images_per_prompt, dtype, generator): - image = image.cast(dtype) - init_latent_dist = self.vae.encode(image).latent_dist - init_latents = init_latent_dist.sample(generator=generator) - init_latents = self.vae.config.scaling_factor * init_latents - - # Expand init_latents for batch_size and num_images_per_prompt - init_latents = paddle.concat([init_latents] * num_images_per_prompt, axis=0) - init_latents_orig = init_latents - - # add noise to latents using the timesteps - noise = randn_tensor(init_latents.shape, generator=generator, dtype=dtype) - init_latents = self.scheduler.add_noise(init_latents, noise, timestep) - latents = init_latents - return latents, init_latents_orig, noise - - @paddle.no_grad() - def __call__( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image] = None, - mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, - strength: float = 0.8, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - add_predicted_noise: Optional[bool] = False, - eta: Optional[float] = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. This is the image whose masked region will be inpainted. - mask_image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be - replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a - PIL image, it will be converted to a single channel (luminance) before use. If mask is a tensor, the - expected shape should be either `(B, H, W, C)` or `(B, C, H, W)`, where C is 1 or 3. - strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength` - is 1, the denoising process will be run on the masked area for the full number of iterations specified - in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more noise to - that region the larger the `strength`. If `strength` is 0, no inpainting will occur. 
- num_inference_steps (`int`, *optional*, defaults to 50): - The reference number of denoising steps. More denoising steps usually lead to a higher quality image at - the expense of slower inference. This parameter will be modulated by `strength`, as explained above. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - add_predicted_noise (`bool`, *optional*, defaults to True): - Use predicted noise instead of random noise when constructing noisy versions of the original image in - the reverse diffusion process - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - - # 1. Check inputs - self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) - - # 2. 
Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Preprocess image and mask - if not isinstance(image, paddle.Tensor): - image = preprocess_image(image, batch_size) - - mask_image = preprocess_mask(mask_image, batch_size, self.vae_scale_factor) - - # 5. set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) - - # 6. Prepare latent variables - # encode the init image into latents and scale the latents - latents, init_latents_orig, noise = self.prepare_latents( - image, latent_timestep, num_images_per_prompt, prompt_embeds.dtype, generator - ) - - # 7. Prepare mask latent - mask = mask_image.cast(latents.dtype) - mask = paddle.concat([mask] * num_images_per_prompt) - - # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 9. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - if i < len(timesteps) - 1: - # masking - if add_predicted_noise: - init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise_pred_uncond, t) - else: - # https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc - noise_timestep = timesteps[i + 1] - init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, noise_timestep) - else: - init_latents_proper = init_latents_orig - - latents = (init_latents_proper * mask) + (latents * (1 - mask)) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 10. Post-processing - image = self.decode_latents(latents) - - # 11. 
Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - - # 12. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py deleted file mode 100644 index 0d89bfa076f0..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ /dev/null @@ -1,670 +0,0 @@ -# Copyright 2023 The InstructPix2Pix Authors and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Callable, List, Optional, Union - -import numpy as np -import paddle -import PIL - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer - -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor -from ..pipeline_utils import DiffusionPipeline -from . import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess -def preprocess(image): - if isinstance(image, paddle.Tensor): - return image - elif isinstance(image, PIL.Image.Image): - image = [image] - - if isinstance(image[0], PIL.Image.Image): - w, h = image[0].size - w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - - image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] - image = np.concatenate(image, axis=0) - image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = 2.0 * image - 1.0 - image = paddle.to_tensor(image) - elif isinstance(image[0], paddle.Tensor): - image = paddle.concat(image, axis=0) - return image - - -class StableDiffusionInstructPix2PixPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): - r""" - Pipeline for pixel-level image editing by following text instructions. Based on Stable Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
- - In addition the pipeline inherits the following loading methods: - - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] - as well as the following saving methods: - - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = True, - ): - super().__init__() - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
- ) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - @paddle.no_grad() - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: Union[paddle.Tensor, PIL.Image.Image] = None, - num_inference_steps: int = 100, - guidance_scale: float = 7.5, - image_guidance_scale: float = 1.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`PIL.Image.Image`): - `Image`, or tensor representing an image batch which will be repainted according to `prompt`. - num_inference_steps (`int`, *optional*, defaults to 100): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. This pipeline requires a value of at least `1`. - image_guidance_scale (`float`, *optional*, defaults to 1.5): - Image guidance scale is to push the generated image towards the inital image `image`. Image guidance - scale is enabled by setting `image_guidance_scale > 1`. Higher image guidance scale encourages to - generate images that are closely linked to the source image `image`, usually at the expense of lower - image quality. This pipeline requires a value of at least `1`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. 
Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Examples: - - ```py - >>> import PIL - >>> import requests - >>> import paddle - >>> from io import BytesIO - - >>> from ppdiffusers import StableDiffusionInstructPix2PixPipeline - - - >>> def download_image(url): - ... response = requests.get(url) - ... return PIL.Image.open(BytesIO(response.content)).convert("RGB") - - - >>> img_url = "https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png" - - >>> image = download_image(img_url).resize((512, 512)) - - >>> pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - ... "timbrooks/instruct-pix2pix", paddle_dtype=paddle.float16 - ... ) - - >>> prompt = "make the mountains snowy" - >>> image = pipe(prompt=prompt, image=image).images[0] - ``` - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Check inputs - self.check_inputs(prompt, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) - - if image is None: - raise ValueError("`image` input cannot be undefined.") - - # 1. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 and image_guidance_scale >= 1.0 - # check if scheduler is in sigmas space - scheduler_is_in_sigma_space = hasattr(self.scheduler, "sigmas") - - # 2. 
Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 3. Preprocess image - image = preprocess(image) - height, width = image.shape[-2:] - - # 4. set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare Image latents - image_latents = self.prepare_image_latents( - image, - batch_size, - num_images_per_prompt, - prompt_embeds.dtype, - do_classifier_free_guidance, - generator, - ) - - # 6. Prepare latent variables - num_channels_latents = self.vae.config.latent_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # 7. Check that shapes of latents and image match the UNet channels - num_channels_image = image_latents.shape[1] - if num_channels_latents + num_channels_image != self.unet.config.in_channels: - raise ValueError( - f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" - f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" - f" `num_channels_image`: {num_channels_image} " - f" = {num_channels_latents+num_channels_image}. Please verify the config of" - " `pipeline.unet` or your `image` input." - ) - - # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 9. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # Expand the latents if we are doing classifier free guidance. - # The latents are expanded 3 times because for pix2pix the guidance\ - # is applied for both the text and the input image. - latent_model_input = paddle.concat([latents] * 3) if do_classifier_free_guidance else latents - - # concat latents, image_latents in the channel dimension - scaled_latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - scaled_latent_model_input = paddle.concat( - [scaled_latent_model_input, image_latents.cast(scaled_latent_model_input.dtype)], axis=1 - ) - - # predict the noise residual - noise_pred = self.unet(scaled_latent_model_input, t, encoder_hidden_states=prompt_embeds).sample - - # Hack: - # For karras style schedulers the model does classifer free guidance using the - # predicted_original_sample instead of the noise_pred. So we need to compute the - # predicted_original_sample here if we are using a karras style scheduler. - if scheduler_is_in_sigma_space: - step_index = (self.scheduler.timesteps == t).nonzero().item() - sigma = self.scheduler.sigmas[step_index] - noise_pred = latent_model_input - sigma * noise_pred - - # perform guidance - if do_classifier_free_guidance: - noise_pred_text, noise_pred_image, noise_pred_uncond = noise_pred.chunk(3) - noise_pred = ( - noise_pred_uncond - + guidance_scale * (noise_pred_text - noise_pred_image) - + image_guidance_scale * (noise_pred_image - noise_pred_uncond) - ) - - # Hack: - # For karras style schedulers the model does classifer free guidance using the - # predicted_original_sample instead of the noise_pred. 
But the scheduler.step function - # expects the noise_pred and computes the predicted_original_sample internally. So we - # need to overwrite the noise_pred here such that the value of the computed - # predicted_original_sample is correct. - if scheduler_is_in_sigma_space: - noise_pred = (noise_pred - latents) / (-sigma) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 10. Post-processing - image = self.decode_latents(latents) - - # 11. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - - # 12. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_ prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. 
- """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - # pix2pix has two negative embeddings, and unlike in other pipelines latents are ordered [prompt_embeds, negative_prompt_embeds, negative_prompt_embeds] - prompt_embeds = paddle.concat([prompt_embeds, negative_prompt_embeds, negative_prompt_embeds]) - - return prompt_embeds - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - def check_inputs( - self, prompt, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None - ): - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - def prepare_image_latents( - self, image, batch_size, num_images_per_prompt, dtype, do_classifier_free_guidance, generator=None - ): - if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)): - raise ValueError( - f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or list but is {type(image)}" - ) - - image = image.cast(dtype) - - batch_size = batch_size * num_images_per_prompt - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if isinstance(generator, list): - image_latents = [self.vae.encode(image[i : i + 1]).latent_dist.mode() for i in range(batch_size)] - image_latents = paddle.concat(image_latents, axis=0) - else: - image_latents = self.vae.encode(image).latent_dist.mode() - - if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: - # expand image_latents for batch_size - deprecation_message = ( - f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial" - " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" - " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many initial images as text prompts to suppress this warning." - ) - deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) - additional_image_per_prompt = batch_size // image_latents.shape[0] - image_latents = paddle.concat([image_latents] * additional_image_per_prompt, axis=0) - elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." - ) - else: - image_latents = paddle.concat([image_latents], axis=0) - - if do_classifier_free_guidance: - uncond_image_latents = paddle.zeros_like(image_latents) - image_latents = paddle.concat([image_latents, image_latents, uncond_image_latents], axis=0) - - return image_latents diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py deleted file mode 100755 index 3976a4546551..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# NOT IMPLEMENT YET! 
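For reference, here is a minimal sketch of how the three-way batch built above is typically consumed: `prepare_image_latents` orders the image latents as [image, image, zeros], matching prompt embeddings ordered [text, uncond, uncond]. The combination below assumes the standard InstructPix2Pix guidance rule; the function name and default scales are illustrative and not taken from this patch.

import numpy as np

def combine_pix2pix_guidance(noise_pred, guidance_scale=7.5, image_guidance_scale=1.5):
    # noise_pred stacks three UNet predictions along the batch axis, in the same
    # order as the concatenations above:
    #   slot 0: text + input image, slot 1: no text + input image, slot 2: fully unconditional
    noise_text, noise_image, noise_uncond = np.split(noise_pred, 3, axis=0)
    # assumed InstructPix2Pix-style combination: image guidance pulls toward the
    # input image, text guidance pulls from the image-only prediction toward the edit
    return (
        noise_uncond
        + image_guidance_scale * (noise_image - noise_uncond)
        + guidance_scale * (noise_text - noise_image)
    )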
-StableDiffusionKDiffusionPipeline = None diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py deleted file mode 100644 index eb5dd1eafcef..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ /dev/null @@ -1,479 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Callable, List, Optional, Union - -import numpy as np -import paddle -import paddle.nn.functional as F -import PIL - -from paddlenlp.transformers import CLIPTextModel, CLIPTokenizer - -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import EulerDiscreteScheduler -from ...utils import logging, randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.preprocess -def preprocess(image): - if isinstance(image, paddle.Tensor): - return image - elif isinstance(image, PIL.Image.Image): - image = [image] - - if isinstance(image[0], PIL.Image.Image): - w, h = image[0].size - w, h = (x - x % 64 for x in (w, h)) # resize to integer multiple of 64 - - image = [np.array(i.resize((w, h)))[None, :] for i in image] - image = np.concatenate(image, axis=0) - image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = 2.0 * image - 1.0 - image = paddle.to_tensor(image) - elif isinstance(image[0], paddle.Tensor): - image = paddle.concat(image, axis=0) - return image - - -class StableDiffusionLatentUpscalePipeline(DiffusionPipeline): - r""" - Pipeline to upscale the resolution of Stable Diffusion output images by a factor of 2. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/main/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. 
- scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`EulerDiscreteScheduler`]. - """ - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: EulerDiscreteScheduler, - ): - super().__init__() - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - ) - - def _encode_prompt(self, prompt, do_classifier_free_guidance, negative_prompt): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `list(int)`): - prompt to be encoded - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - """ - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_length=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - text_encoder_out = self.text_encoder( - text_input_ids, - output_hidden_states=True, - ) - text_embeddings = text_encoder_out.hidden_states[-1] - text_pooler_out = text_encoder_out.pooler_output - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - max_length = text_input_ids.shape[-1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_length=True, - return_tensors="pd", - ) - - uncond_encoder_out = self.text_encoder( - uncond_input.input_ids, - output_hidden_states=True, - ) - - uncond_embeddings = uncond_encoder_out.hidden_states[-1] - uncond_pooler_out = uncond_encoder_out.pooler_output - - # For classifier free guidance, we need to do two forward passes. 
- # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) - text_pooler_out = paddle.concat([uncond_pooler_out, text_pooler_out]) - - return text_embeddings, text_pooler_out - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - def check_inputs(self, prompt, image, callback_steps): - if not isinstance(prompt, str) and not isinstance(prompt, list): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if ( - not isinstance(image, paddle.Tensor) - and not isinstance(image, PIL.Image.Image) - and not isinstance(image, list) - ): - raise ValueError( - f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}" - ) - - # verify batch size of prompt and image are same if image is a list or tensor - if isinstance(image, list) or isinstance(image, paddle.Tensor): - if isinstance(prompt, str): - batch_size = 1 - else: - batch_size = len(prompt) - if isinstance(image, list): - image_batch_size = len(image) - else: - image_batch_size = image.shape[0] if image.ndim == 4 else 1 - if batch_size != image_batch_size: - raise ValueError( - f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}." - " Please make sure that passed `prompt` matches the batch size of `image`." - ) - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height, width) - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - else: - if latents.shape != list(shape): - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @paddle.no_grad() - def __call__( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]], - num_inference_steps: int = 75, - guidance_scale: float = 9.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image upscaling. 
- image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `paddle.Tensor`): - `Image`, or tensor representing an image batch which will be upscaled. If it's a tensor, it can be - either a latent output from a stable diffusion model, or an image tensor in the range `[-1, 1]`. It - will be considered a `latent` if `image.shape[1]` is `4`; otherwise, it will be considered to be an - image representation and encoded using this pipeline's `vae` encoder. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Examples: - ```py - >>> from ppdiffusers import StableDiffusionLatentUpscalePipeline, StableDiffusionPipeline - >>> import paddle - - - >>> pipeline = StableDiffusionPipeline.from_pretrained( - ... "CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16 - ... ) - - >>> model_id = "stabilityai/sd-x2-latent-upscaler" - >>> upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(model_id, paddle_dtype=paddle.float16) - - >>> prompt = "a photo of an astronaut high resolution, unreal engine, ultra realistic" - >>> generator = paddle.Generator().manual_seed(33) - - >>> low_res_latents = pipeline(prompt, generator=generator, output_type="latent").images - - >>> with paddle.no_grad(): - ... 
image = pipeline.decode_latents(low_res_latents) - >>> image = pipeline.numpy_to_pil(image)[0] - - >>> image.save("../images/a1.png") - - >>> upscaled_image = upscaler( - ... prompt=prompt, - ... image=low_res_latents, - ... num_inference_steps=20, - ... guidance_scale=0, - ... generator=generator, - ... ).images[0] - - >>> upscaled_image.save("../images/a2.png") - ``` - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - - # 1. Check inputs - self.check_inputs(prompt, image, callback_steps) - - # 2. Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - if guidance_scale == 0: - prompt = [""] * batch_size - - # 3. Encode input prompt - text_embeddings, text_pooler_out = self._encode_prompt(prompt, do_classifier_free_guidance, negative_prompt) - - # 4. Preprocess image - image = preprocess(image) - image = image.cast(text_embeddings.dtype) - if image.shape[1] == 3: - # encode image if not in latent-space yet - image = self.vae.encode(image).latent_dist.sample() * self.vae.config.scaling_factor - - # 5. set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - batch_multiplier = 2 if do_classifier_free_guidance else 1 - image = image[None, :] if image.ndim == 3 else image - image = paddle.concat([image] * batch_multiplier) - - # 5. Add noise to image (set to be 0): - # (see below notes from the author): - # "the This step theoretically can make the model work better on out-of-distribution inputs, but mostly just seems to make it match the input less, so it's turned off by default." - noise_level = paddle.to_tensor([0.0], dtype=paddle.float32) - noise_level = paddle.concat([noise_level] * image.shape[0]) - inv_noise_level = (noise_level**2 + 1) ** (-0.5) - - # TODO F.interpolate donot support float16 - image_cond = ( - F.interpolate(image.cast("float32"), scale_factor=2, mode="nearest") * inv_noise_level[:, None, None, None] - ) - image_cond = image_cond.cast(text_embeddings.dtype) - - noise_level_embed = paddle.concat( - [ - paddle.ones([text_pooler_out.shape[0], 64], dtype=text_pooler_out.dtype), - paddle.zeros([text_pooler_out.shape[0], 64], dtype=text_pooler_out.dtype), - ], - axis=1, - ) - - timestep_condition = paddle.concat([noise_level_embed, text_pooler_out], axis=1) - - # 6. Prepare latent variables - height, width = image.shape[2:] - num_channels_latents = self.vae.config.latent_channels - latents = self.prepare_latents( - batch_size, - num_channels_latents, - height * 2, # 2x upscale - width * 2, - text_embeddings.dtype, - generator, - latents, - ) - - # 7. Check that sizes of image and latents match - num_channels_image = image.shape[1] - if num_channels_latents + num_channels_image != self.unet.config.in_channels: - raise ValueError( - f"Incorrect configuration settings! 
The config of `pipeline.unet`: {self.unet.config} expects" - f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" - f" `num_channels_image`: {num_channels_image} " - f" = {num_channels_latents+num_channels_image}. Please verify the config of" - " `pipeline.unet` or your `image` input." - ) - - # 9. Denoising loop - num_warmup_steps = 0 - - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - sigma = self.scheduler.sigmas[i] - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - scaled_model_input = paddle.concat( - [scaled_model_input, image_cond.cast(scaled_model_input.dtype)], axis=1 - ) - # preconditioning parameter based on Karras et al. (2022) (table 1) - timestep = paddle.log(sigma) * 0.25 - noise_pred = self.unet( - scaled_model_input, - timestep, - encoder_hidden_states=text_embeddings, - timestep_cond=timestep_condition, - ).sample - - # in original repo, the output contains a variance channel that's not used - noise_pred = noise_pred[:, :-1] - - # apply preconditioning, based on table 1 in Karras et al. (2022) - inv_sigma = 1 / (sigma**2 + 1) - noise_pred = inv_sigma * latent_model_input + self.scheduler.scale_model_input(sigma, t) * noise_pred - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 10. Post-processing - image = self.decode_latents(latents) - - # 11. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,) - - return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_mega.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_mega.py deleted file mode 100644 index 93a2487ee267..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_mega.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
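The coefficients in the latent-upscaler denoising loop above follow the Karras et al. (2022), Table 1 preconditioning with sigma_data = 1. A small self-contained sketch of that mapping (illustrative, not code from this patch):

import math

def karras_preconditioning(sigma, sigma_data=1.0):
    # With sigma_data = 1: c_skip reproduces `inv_sigma`, c_out matches
    # `scheduler.scale_model_input(sigma, t)`, c_in is the scaling the scheduler
    # applies to the latents, and c_noise is `timestep = paddle.log(sigma) * 0.25`.
    c_skip = sigma_data**2 / (sigma**2 + sigma_data**2)
    c_out = sigma * sigma_data / math.sqrt(sigma**2 + sigma_data**2)
    c_in = 1.0 / math.sqrt(sigma**2 + sigma_data**2)
    c_noise = 0.25 * math.log(sigma)
    return c_skip, c_out, c_in, c_noise

# denoised = c_skip * noisy_latents + c_out * unet(c_in * noisy_latents, c_noise, ...)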
- -import inspect -from typing import Any, Callable, Dict, List, Optional, Union - -import paddle -import PIL.Image - -from ...utils import logging -from .pipeline_stable_diffusion import StableDiffusionPipeline -from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline -from .pipeline_stable_diffusion_inpaint_legacy import ( - StableDiffusionInpaintPipelineLegacy, -) - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class StableDiffusionMegaPipeline(StableDiffusionPipeline): - r""" - Pipeline for generation using Stable Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular xxxx, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`PNDMScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] - or [`DPMSolverMultistepScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPFeatureExtractor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
- """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __call__(self, *args, **kwargs): - return self.text2img(*args, **kwargs) - - def text2img( - self, - prompt: Union[str, List[str]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ): - - expected_components = inspect.signature(StableDiffusionPipeline.__init__).parameters.keys() - components = {name: component for name, component in self.components.items() if name in expected_components} - temp_pipeline = StableDiffusionPipeline( - **components, requires_safety_checker=self.config.requires_safety_checker - ) - output = temp_pipeline( - prompt=prompt, - height=height, - width=width, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - negative_prompt=negative_prompt, - num_images_per_prompt=num_images_per_prompt, - eta=eta, - generator=generator, - latents=latents, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - output_type=output_type, - return_dict=return_dict, - callback=callback, - callback_steps=callback_steps, - cross_attention_kwargs=cross_attention_kwargs, - ) - return output - - def img2img( - self, - prompt: Union[str, List[str]] = None, - image: Union[paddle.Tensor, PIL.Image.Image] = None, - strength: float = 0.8, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: Optional[float] = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - **kwargs, - ): - expected_components = inspect.signature(StableDiffusionImg2ImgPipeline.__init__).parameters.keys() - components = {name: component for name, component in self.components.items() if name in expected_components} - temp_pipeline = StableDiffusionImg2ImgPipeline( - **components, requires_safety_checker=self.config.requires_safety_checker - ) - output = temp_pipeline( - prompt=prompt, - image=image, - strength=strength, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - negative_prompt=negative_prompt, - num_images_per_prompt=num_images_per_prompt, - eta=eta, - generator=generator, - negative_prompt_embeds=negative_prompt_embeds, - prompt_embeds=prompt_embeds, - output_type=output_type, - return_dict=return_dict, - callback=callback, - callback_steps=callback_steps, - **kwargs, - ) - - return output - - def inpaint_legacy( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image] = None, - mask_image: Union[paddle.Tensor, 
PIL.Image.Image] = None, - strength: float = 0.8, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - add_predicted_noise: Optional[bool] = False, - eta: Optional[float] = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - **kwargs, - ): - expected_components = inspect.signature(StableDiffusionInpaintPipelineLegacy.__init__).parameters.keys() - components = {name: component for name, component in self.components.items() if name in expected_components} - temp_pipeline = StableDiffusionInpaintPipelineLegacy( - **components, requires_safety_checker=self.config.requires_safety_checker - ) - output = temp_pipeline( - prompt=prompt, - image=image, - mask_image=mask_image, - strength=strength, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - negative_prompt=negative_prompt, - num_images_per_prompt=num_images_per_prompt, - add_predicted_noise=add_predicted_noise, - eta=eta, - generator=generator, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - output_type=output_type, - return_dict=return_dict, - callback=callback, - callback_steps=callback_steps, - **kwargs, - ) - - return output diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py deleted file mode 100644 index f93b115ec7eb..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ /dev/null @@ -1,731 +0,0 @@ -# Copyright 2023 TIME Authors and The HuggingFace Team. All rights reserved." -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import inspect -from typing import Any, Callable, Dict, List, Optional, Union - -import paddle - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer - -from ...loaders import TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import PNDMScheduler -from ...schedulers.scheduling_utils import SchedulerMixin -from ...utils import logging, randn_tensor -from ..pipeline_utils import DiffusionPipeline -from . 
import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -AUGS_CONST = ["A photo of ", "An image of ", "A picture of "] - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import paddle - >>> from ppdiffusers import StableDiffusionModelEditingPipeline - >>> model_ckpt = "CompVis/stable-diffusion-v1-4" - >>> pipe = StableDiffusionModelEditingPipeline.from_pretrained(model_ckpt) - >>> source_prompt = "A pack of roses" - >>> destination_prompt = "A pack of blue roses" - >>> pipe.edit_model(source_prompt, destination_prompt) - >>> prompt = "A field of roses" - >>> image = pipe(prompt).images[0] - ``` -""" - - -class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoaderMixin): - r""" - Pipeline for text-to-image model editing using "Editing Implicit Assumptions in Text-to-Image Diffusion Models". - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.). - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - with_to_k ([`bool`]): - Whether to edit the key projection matrices along wiht the value projection matrices. - with_augs ([`list`]): - Textual augmentations to apply while editing the text-to-image model. Set to [] for no augmentations. - """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: SchedulerMixin, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = True, - with_to_k: bool = True, - with_augs: list = AUGS_CONST, - ): - super().__init__() - - if isinstance(scheduler, PNDMScheduler): - logger.error("PNDMScheduler for this pipeline is currently not supported.") - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. 
Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. Both the diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - self.with_to_k = with_to_k - self.with_augs = with_augs - - # get cross-attention layers - ca_layers = [] - - def append_ca(net_): - if net_.__class__.__name__ == "CrossAttention": - ca_layers.append(net_) - elif hasattr(net_, "children"): - for net__ in net_.children(): - append_ca(net__) - - # recursively find all cross-attention layers in unet - for net in self.unet.named_children(): - if "down" in net[0]: - append_ca(net[1]) - elif "up" in net[0]: - append_ca(net[1]) - elif "mid" in net[0]: - append_ca(net[1]) - - # get projection matrices - self.ca_clip_layers = [l for l in ca_layers if l.to_v.in_features == 768] - self.projection_matrices = [l.to_v for l in self.ca_clip_layers] - self.og_matrices = [copy.deepcopy(l.to_v) for l in self.ca_clip_layers] - if self.with_to_k: - self.projection_matrices = self.projection_matrices + [l.to_k for l in self.ca_clip_layers] - self.og_matrices = self.og_matrices + [copy.deepcopy(l.to_k) for l in self.ca_clip_layers] - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. - When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several - steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. 
- Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. 
Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @paddle.no_grad() - def edit_model( - self, - source_prompt: str, - destination_prompt: str, - lamb: float = 0.1, - restart_params: bool = True, - ): - r""" - Apply model editing via closed-form solution (see Eq. 
5 in the TIME paper https://arxiv.org/abs/2303.08084) - Args: - source_prompt (`str`): - The source prompt containing the concept to be edited. - destination_prompt (`str`): - The destination prompt. Must contain all words from source_prompt with additional ones to specify the - target edit. - lamb (`float`, *optional*, defaults to 0.1): - The lambda parameter specifying the regularization intesity. Smaller values increase the editing power. - restart_params (`bool`, *optional*, defaults to True): - Restart the model parameters to their pre-trained version before editing. This is done to avoid edit - compounding. When it is False, edits accumulate. - """ - - # restart LDM parameters - if restart_params: - num_ca_clip_layers = len(self.ca_clip_layers) - for idx_, l in enumerate(self.ca_clip_layers): - l.to_v = copy.deepcopy(self.og_matrices[idx_]) - self.projection_matrices[idx_] = l.to_v - if self.with_to_k: - l.to_k = copy.deepcopy(self.og_matrices[num_ca_clip_layers + idx_]) - self.projection_matrices[num_ca_clip_layers + idx_] = l.to_k - - # set up sentences - old_texts = [source_prompt] - new_texts = [destination_prompt] - # add augmentations - base = old_texts[0] if old_texts[0][0:1] != "A" else "a" + old_texts[0][1:] - for aug in self.with_augs: - old_texts.append(aug + base) - base = new_texts[0] if new_texts[0][0:1] != "A" else "a" + new_texts[0][1:] - for aug in self.with_augs: - new_texts.append(aug + base) - - # prepare input k* and v* - old_embs, new_embs = [], [] - for old_text, new_text in zip(old_texts, new_texts): - text_input = self.tokenizer( - [old_text, new_text], - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_embeddings = self.text_encoder(text_input.input_ids)[0] - old_emb, new_emb = text_embeddings - old_embs.append(old_emb) - new_embs.append(new_emb) - - # identify corresponding destinations for each token in old_emb - idxs_replaces = [] - for old_text, new_text in zip(old_texts, new_texts): - tokens_a = self.tokenizer(old_text).input_ids - tokens_b = self.tokenizer(new_text).input_ids - tokens_a = [ - self.tokenizer.encode("a ")["input_ids"][1] if self.tokenizer.decode(t) == "an" else t - for t in tokens_a - ] - tokens_b = [ - self.tokenizer.encode("a ")["input_ids"][1] if self.tokenizer.decode(t) == "an" else t - for t in tokens_b - ] - num_orig_tokens = len(tokens_a) - idxs_replace = [] - j = 0 - for i in range(num_orig_tokens): - curr_token = tokens_a[i] - while tokens_b[j] != curr_token: - j += 1 - idxs_replace.append(j) - j += 1 - while j < 77: - idxs_replace.append(j) - j += 1 - while len(idxs_replace) < 77: - idxs_replace.append(76) - idxs_replaces.append(idxs_replace) - - # prepare batch: for each pair of setences, old context and new values - contexts, valuess = [], [] - for old_emb, new_emb, idxs_replace in zip(old_embs, new_embs, idxs_replaces): - context = old_emb.detach() - values = [] - with paddle.no_grad(): - for layer in self.projection_matrices: - values.append(layer(new_emb[idxs_replace]).detach()) - contexts.append(context) - valuess.append(values) - - # edit the model - for layer_num in range(len(self.projection_matrices)): - # mat1 = \lambda W + \sum{v k^T} - mat1 = lamb * self.projection_matrices[layer_num].weight - - # mat2 = \lambda I + \sum{k k^T} - mat2 = lamb * paddle.eye(self.projection_matrices[layer_num].weight.shape[1]) - - # aggregate sums for mat1, mat2 - for context, values in zip(contexts, valuess): - context_vector = context.reshape([context.shape[0], 
context.shape[1], 1]) - context_vector_T = context.reshape([context.shape[0], 1, context.shape[1]]) - value_vector = values[layer_num].reshape([values[layer_num].shape[0], values[layer_num].shape[1], 1]) - for_mat1 = (value_vector @ context_vector_T).sum(axis=0) - for_mat2 = (context_vector @ context_vector_T).sum(axis=0) - mat1 += for_mat1 - mat2 += for_mat2 - - # update projection matrix - mat = mat1 @ paddle.inverse(mat2) - self.projection_matrices[layer_num].weight = paddle.create_parameter( - shape=mat.shape, dtype=mat.dtype, default_initializer=paddle.nn.initializer.Assign(mat) - ) - - @paddle.no_grad() - def __call__( - self, - prompt: Union[str, List[str]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - Examples: - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. 
Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - image = latents - has_nsfw_concept = None - elif output_type == "pil": - # 8. Post-processing - image = self.decode_latents(latents) - - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - - # 10. Convert to PIL - image = self.numpy_to_pil(image) - else: - # 8. Post-processing - image = self.decode_latents(latents) - - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py deleted file mode 100644 index 277a66ced598..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ /dev/null @@ -1,601 +0,0 @@ -# Copyright 2023 MultiDiffusion Authors and The HuggingFace Team. All rights reserved." -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
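For reference, the classifier-free guidance step that both of the removed pipelines perform inside their denoising loops (the guidance weight `w` from Eq. 2 of the Imagen paper, https://arxiv.org/pdf/2205.11487.pdf) is just a one-line combination of the unconditional and text-conditional noise predictions. The sketch below is an illustrative reconstruction, not part of the removed files; the helper name `cfg_combine` is hypothetical, and it assumes the UNet output stacks the unconditional and text-conditional predictions along the batch axis, as the loop code above does.

```py
import paddle


def cfg_combine(noise_pred: paddle.Tensor, guidance_scale: float) -> paddle.Tensor:
    # `noise_pred` stacks the unconditional and text-conditional UNet outputs
    # along the batch axis, i.e. shape [2 * batch, C, H, W] (assumed layout).
    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    # Eq. 2 of the Imagen paper: e = e_uncond + w * (e_text - e_uncond)
    return noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
```

With `guidance_scale = 1.0` this returns the text-conditional prediction unchanged, which matches the comment in the pipelines that `guidance_scale = 1` corresponds to doing no classifier-free guidance; values above 1 push the prediction further toward the text-conditional direction.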
- -import inspect -from typing import Any, Callable, Dict, List, Optional, Union - -import paddle - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer - -from ...loaders import TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import DDIMScheduler, PNDMScheduler -from ...utils import logging, randn_tensor, replace_example_docstring -from ..pipeline_utils import DiffusionPipeline -from . import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import paddle - >>> from ppdiffusers import StableDiffusionPanoramaPipeline, DDIMScheduler - - >>> model_ckpt = "stabilityai/stable-diffusion-2-base" - >>> scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler") - >>> pipe = StableDiffusionPanoramaPipeline.from_pretrained( - ... model_ckpt, scheduler=scheduler, paddle_dtype=paddle.float16 - ... ) - - >>> prompt = "a photo of the dolomites" - >>> image = pipe(prompt).images[0] - ``` -""" - - -class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderMixin): - r""" - Pipeline for text-to-image generation using "MultiDiffusion: Fusing Diffusion Paths for Controlled Image - Generation". - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.). - - To generate panorama-like images, be sure to pass the `width` parameter accordingly when using the pipeline. Our - recommendation for the `width` value is 2048. This is the default value of the `width` parameter for this pipeline. - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. The original work - on Multi Diffsion used the [`DDIMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
- """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: DDIMScheduler, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = True, - ): - super().__init__() - - if isinstance(scheduler, PNDMScheduler): - logger.error("PNDMScheduler for this pipeline is currently not supported.") - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`) - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. 
- """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - def get_views(self, panorama_height, panorama_width, window_size=64, stride=8): - # Here, we define the mappings F_i (see Eq. 
7 in the MultiDiffusion paper https://arxiv.org/abs/2302.08113) - panorama_height /= 8 - panorama_width /= 8 - num_blocks_height = (panorama_height - window_size) // stride + 1 - num_blocks_width = (panorama_width - window_size) // stride + 1 - total_num_blocks = int(num_blocks_height * num_blocks_width) - views = [] - for i in range(total_num_blocks): - h_start = int((i // num_blocks_width) * stride) - h_end = h_start + window_size - w_start = int((i % num_blocks_width) * stride) - w_end = w_start + window_size - views.append((h_start, h_end, w_start, w_end)) - return views - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]] = None, - height: Optional[int] = 512, - width: Optional[int] = 2048, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`int`, *optional*, defaults to 512: - The height in pixels of the generated image. - width (`int`, *optional*, defaults to 2048): - The width in pixels of the generated image. The width is kept to a high number because the - pipeline is supposed to be used for generating panorama-like images. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. 
Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/ppdiffusers/models/cross_attention.py). - - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. 
Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # 6. Define panorama grid and initialize views for synthesis. - views = self.get_views(height, width) - count = paddle.zeros_like(latents) - value = paddle.zeros_like(latents) - - # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 8. Denoising loop - # Each denoising step also includes refinement of the latents with respect to the - # views. - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - count.zero_() - value.zero_() - - # generate views - # Here, we iterate through different spatial crops of the latents and denoise them. These - # denoised (latent) crops are then averaged to produce the final latent - # for the current timestep via MultiDiffusion. Please see Sec. 4.1 in the - # MultiDiffusion paper for more details: https://arxiv.org/abs/2302.08113 - for h_start, h_end, w_start, w_end in views: - # get the latents corresponding to the current view coordinates - latents_for_view = latents[:, :, h_start:h_end, w_start:w_end] - - # expand the latents if we are doing classifier free guidance - latent_model_input = ( - paddle.concat([latents_for_view] * 2) if do_classifier_free_guidance else latents_for_view - ) - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents_view_denoised = self.scheduler.step( - noise_pred, t, latents_for_view, **extra_step_kwargs - ).prev_sample - value[:, :, h_start:h_end, w_start:w_end] += latents_view_denoised - count[:, :, h_start:h_end, w_start:w_end] += 1 - - # take the MultiDiffusion step. Eq. 5 in MultiDiffusion paper: https://arxiv.org/abs/2302.08113 - latents = paddle.where(count > 0, value / count, value) - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 8. Post-processing - image = self.decode_latents(latents) - - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - - # 10. 
Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py deleted file mode 100644 index bb3f67b4c2a2..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ /dev/null @@ -1,1182 +0,0 @@ -# Copyright 2023 Pix2Pix Zero Authors and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from dataclasses import dataclass -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -import paddle.optimizer -import PIL - -from paddlenlp.transformers import ( - BlipForConditionalGeneration, - BlipProcessor, - CLIPImageProcessor, - CLIPTextModel, - CLIPTokenizer, -) - -from ...loaders import TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...models.attention_processor import Attention -from ...schedulers import ( - DDIMScheduler, - DDPMScheduler, - EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler, -) -from ...schedulers.scheduling_ddim_inverse import DDIMInverseScheduler -from ...utils import ( - PIL_INTERPOLATION, - BaseOutput, - deprecate, - logging, - randint_tensor, - randn_tensor, - replace_example_docstring, -) -from ..pipeline_utils import DiffusionPipeline -from . import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -@dataclass -class Pix2PixInversionPipelineOutput(BaseOutput, TextualInversionLoaderMixin): - """ - Output class for Stable Diffusion pipelines. - - Args: - latents (`paddle.Tensor`) - inverted latents tensor - images (`List[PIL.Image.Image]` or `np.ndarray`) - List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, - num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. - """ - - latents: paddle.Tensor - images: Union[List[PIL.Image.Image], np.ndarray] - - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import requests - >>> import paddle - - >>> from ppdiffusers import DDIMScheduler, StableDiffusionPix2PixZeroPipeline - - - >>> def download(embedding_url, local_filepath): - ... r = requests.get(embedding_url) - ... with open(local_filepath, "wb") as f: - ... 
f.write(r.content) - - - >>> model_ckpt = "CompVis/stable-diffusion-v1-4" - >>> pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained(model_ckpt, paddle_dtype=paddle.float16) - >>> pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config) - - >>> prompt = "a high resolution painting of a cat in the style of van gough" - >>> source_emb_url = "https://hf.co/datasets/sayakpaul/sample-datasets/resolve/main/cat.pt" - >>> target_emb_url = "https://hf.co/datasets/sayakpaul/sample-datasets/resolve/main/dog.pt" - - >>> for url in [source_emb_url, target_emb_url]: - ... download(url, url.split("/")[-1]) - - >>> src_embeds = paddle.load(source_emb_url.split("/")[-1]) - >>> target_embeds = paddle.load(target_emb_url.split("/")[-1]) - >>> images = pipeline( - ... prompt, - ... source_embeds=src_embeds, - ... target_embeds=target_embeds, - ... num_inference_steps=50, - ... cross_attention_guidance_amount=0.15, - ... ).images - - >>> images[0].save("edited_image_dog.png") - ``` -""" - -EXAMPLE_INVERT_DOC_STRING = """ - Examples: - ```py - >>> import paddle - >>> from paddlenlp.transformers import BlipForConditionalGeneration, BlipProcessor - >>> from ppdiffusers import DDIMScheduler, DDIMInverseScheduler, StableDiffusionPix2PixZeroPipeline - - >>> import requests - >>> from PIL import Image - - >>> captioner_id = "Salesforce/blip-image-captioning-base" - >>> processor = BlipProcessor.from_pretrained(captioner_id) - >>> model = BlipForConditionalGeneration.from_pretrained( - ... captioner_id, paddle_dtype=paddle.float16, - ... ) - - >>> sd_model_ckpt = "CompVis/stable-diffusion-v1-4" - >>> pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained( - ... sd_model_ckpt, - ... caption_generator=model, - ... caption_processor=processor, - ... paddle_dtype=paddle.float16, - ... safety_checker=None, - ... ) - - >>> pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config) - >>> pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config) - >>> pipeline.enable_model_cpu_offload() - - >>> img_url = "https://github.com/pix2pixzero/pix2pix-zero/raw/main/assets/test_images/cats/cat_6.png" - - >>> raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB").resize((512, 512)) - >>> # generate caption - >>> caption = pipeline.generate_caption(raw_image) - - >>> # "a photography of a cat with flowers and dai dai daie - daie - daie kasaii" - >>> inv_latents = pipeline.invert(caption, image=raw_image).latents - >>> # we need to generate source and target embeds - - >>> source_prompts = ["a cat sitting on the street", "a cat playing in the field", "a face of a cat"] - - >>> target_prompts = ["a dog sitting on the street", "a dog playing in the field", "a face of a dog"] - - >>> source_embeds = pipeline.get_embeds(source_prompts) - >>> target_embeds = pipeline.get_embeds(target_prompts) - >>> # the latents can then be used to edit a real image - - >>> image = pipeline( - ... caption, - ... source_embeds=source_embeds, - ... target_embeds=target_embeds, - ... num_inference_steps=50, - ... cross_attention_guidance_amount=0.15, - ... generator=generator, - ... latents=inv_latents, - ... negative_prompt=caption, - ... 
).images[0] - >>> image.save("edited_image.png") - ``` -""" - - -# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess -def preprocess(image): - if isinstance(image, paddle.Tensor): - return image - elif isinstance(image, PIL.Image.Image): - image = [image] - - if isinstance(image[0], PIL.Image.Image): - w, h = image[0].size - w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - - image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] - image = np.concatenate(image, axis=0) - image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = 2.0 * image - 1.0 - image = paddle.to_tensor(image) - elif isinstance(image[0], paddle.Tensor): - image = paddle.concat(image, axis=0) - return image - - -def prepare_unet(unet: UNet2DConditionModel): - """Modifies the UNet (`unet`) to perform Pix2Pix Zero optimizations.""" - pix2pix_zero_attn_procs = {} - for name in unet.attn_processors.keys(): - module_name = name.replace(".processor", "") - module: nn.Layer = unet.get_sublayer(module_name) - if "attn2" in name: - pix2pix_zero_attn_procs[name] = Pix2PixZeroAttnProcessor(is_pix2pix_zero=True) - for params in module.parameters(): - params.stop_gradient = False - else: - pix2pix_zero_attn_procs[name] = Pix2PixZeroAttnProcessor(is_pix2pix_zero=False) - for params in module.parameters(): - params.stop_gradient = True - - unet.set_attn_processor(pix2pix_zero_attn_procs) - return unet - - -class Pix2PixZeroL2Loss: - def __init__(self): - self.loss = 0.0 - - def compute_loss(self, predictions, targets): - self.loss += ((predictions - targets) ** 2).sum((1, 2)).mean(0) - - -class Pix2PixZeroAttnProcessor: - """An attention processor class to store the attention weights. - In Pix2Pix Zero, it happens during computations in the cross-attention blocks.""" - - def __init__(self, is_pix2pix_zero=False): - self.is_pix2pix_zero = is_pix2pix_zero - if self.is_pix2pix_zero: - self.reference_cross_attn_map = {} - - def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - timestep=None, - loss=None, - ): - batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - query = attn.to_q(hidden_states) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - query = attn.head_to_batch_dim(query) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - - attention_probs = attn.get_attention_scores(query, key, attention_mask) - if self.is_pix2pix_zero and timestep is not None: - # new bookkeeping to save the attention weights. 
- if loss is None: - self.reference_cross_attn_map[timestep.item()] = attention_probs.detach().flatten(0, 1) - # compute loss - elif loss is not None: - prev_attn_probs = self.reference_cross_attn_map.pop(timestep.item()) - loss.compute_loss(attention_probs.flatten(0, 1), prev_attn_probs) - - hidden_states = paddle.matmul(attention_probs, value) - hidden_states = attn.batch_to_head_dim(hidden_states) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - return hidden_states - - -class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline): - r""" - Pipeline for pixel-levl image editing using Pix2Pix Zero. Based on Stable Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`], or [`DDPMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - requires_safety_checker (bool): - Whether the pipeline requires a safety checker. We recommend setting it to True if you're using the - pipeline publicly. - """ - _optional_components = [ - "safety_checker", - "feature_extractor", - "caption_generator", - "caption_processor", - "inverse_scheduler", - ] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[DDPMScheduler, DDIMScheduler, EulerAncestralDiscreteScheduler, LMSDiscreteScheduler], - feature_extractor: CLIPImageProcessor, - safety_checker: StableDiffusionSafetyChecker, - inverse_scheduler: DDIMInverseScheduler, - caption_generator: BlipForConditionalGeneration, - caption_processor: BlipProcessor, - requires_safety_checker: bool = True, - ): - super().__init__() - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. 
PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - caption_processor=caption_processor, - caption_generator=caption_generator, - inverse_scheduler=inverse_scheduler, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. 
- """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - image, - source_embeds, - target_embeds, - callback_steps, - prompt_embeds=None, - ): - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - if source_embeds is None and target_embeds is None: - raise ValueError("`source_embeds` and `target_embeds` cannot be undefined.") - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @paddle.no_grad() - def generate_caption(self, images): - """Generates caption for a given image.""" - # make sure cast caption_generator position_ids dtype int64 - try: - self.caption_generator.text_decoder.bert.embeddings.position_ids = ( - self.caption_generator.text_decoder.bert.embeddings.position_ids.cast("int64") - ) - except Exception: - pass - text = "a photography of" - - inputs = self.caption_processor(images=images, text=text, return_tensors="pd") - inputs["pixel_values"] = inputs["pixel_values"].cast(self.caption_generator.dtype) - outputs = self.caption_generator.generate(**inputs, max_length=128)[0] - - # offload caption generator - caption = self.caption_processor.batch_decode(outputs, skip_special_tokens=True)[0] - return text + " " + caption - - def construct_direction(self, embs_source: paddle.Tensor, embs_target: paddle.Tensor): - """Constructs the edit direction to steer the image generation process semantically.""" - return (embs_target.mean(0) - embs_source.mean(0)).unsqueeze(0) - - @paddle.no_grad() - def get_embeds(self, prompt: List[str], batch_size: int = 16) -> paddle.Tensor: - num_prompts = len(prompt) - embeds = [] - for i in range(0, num_prompts, batch_size): - prompt_slice = prompt[i : i + batch_size] - - input_ids = self.tokenizer( - prompt_slice, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ).input_ids - - embeds.append(self.text_encoder(input_ids)[0]) - - return paddle.concat(embeds, axis=0).mean(0)[None] - - def prepare_image_latents(self, image, batch_size, dtype, generator=None): - if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)): - raise ValueError( - f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or list but is {type(image)}" - ) - - image = image.cast(dtype) - - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if isinstance(generator, list): - latents = [self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)] - latents = paddle.concat(latents, axis=0) - else: - latents = self.vae.encode(image).latent_dist.sample(generator) - - latents = self.vae.config.scaling_factor * latents - - if batch_size != latents.shape[0]: - if batch_size % latents.shape[0] == 0: - # expand image_latents for batch_size - deprecation_message = ( - f"You have passed {batch_size} text prompts (`prompt`), but only {latents.shape[0]} initial" - " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" - " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many initial images as text prompts to suppress this warning." 
- ) - deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) - additional_latents_per_image = batch_size // latents.shape[0] - latents = paddle.concat([latents] * additional_latents_per_image, axis=0) - else: - raise ValueError( - f"Cannot duplicate `image` of batch size {latents.shape[0]} to {batch_size} text prompts." - ) - else: - latents = paddle.concat([latents], aixs=0) - - return latents - - def get_epsilon(self, model_output: paddle.Tensor, sample: paddle.Tensor, timestep: int): - pred_type = self.inverse_scheduler.config.prediction_type - alpha_prod_t = self.inverse_scheduler.alphas_cumprod[timestep] - beta_prod_t = 1 - alpha_prod_t - if pred_type == "epsilon": - return model_output - elif pred_type == "sample": - return (sample - alpha_prod_t ** (0.5) * model_output) / beta_prod_t ** (0.5) - elif pred_type == "v_prediction": - return (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample - else: - raise ValueError( - f"prediction_type given as {pred_type} must be one of `epsilon`, `sample`, or `v_prediction`" - ) - - def auto_corr_loss(self, hidden_states, generator=None): - reg_loss = 0.0 - for i in range(hidden_states.shape[0]): - for j in range(hidden_states.shape[1]): - noise = hidden_states[i : i + 1, j : j + 1, :, :] - while True: - roll_amount = randint_tensor(noise.shape[2] // 2, shape=(1,), generator=generator).item() - reg_loss += (noise * paddle.roll(noise, shifts=roll_amount, axis=2)).mean() ** 2 - reg_loss += (noise * paddle.roll(noise, shifts=roll_amount, axis=3)).mean() ** 2 - - if noise.shape[2] <= 8: - break - noise = F.avg_pool2d(noise, kernel_size=2) - return reg_loss - - def kl_divergence(self, hidden_states): - mean = hidden_states.mean() - var = hidden_states.var() - return var + mean**2 - 1 - paddle.log(var + 1e-7) - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - image: Optional[Union[paddle.Tensor, PIL.Image.Image]] = None, - source_embeds: paddle.Tensor = None, - target_embeds: paddle.Tensor = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - cross_attention_guidance_amount: float = 0.1, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - source_embeds (`paddle.Tensor`): - Source concept embeddings. Generation of the embeddings as per the [original - paper](https://arxiv.org/abs/2302.03027). Used in discovering the edit direction. - target_embeds (`paddle.Tensor`): - Target concept embeddings. Generation of the embeddings as per the [original - paper](https://arxiv.org/abs/2302.03027). Used in discovering the edit direction. 
- height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - cross_attention_guidance_amount (`float`, defaults to 0.1): - Amount of guidance needed from the reference cross-attention maps. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. 
- - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Define the spatial resolutions. - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - image, - source_embeds, - target_embeds, - callback_steps, - prompt_embeds, - ) - - # 3. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - if cross_attention_kwargs is None: - cross_attention_kwargs = {} - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Generate the inverted noise from the input image or any other image - # generated from the input prompt. - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - latents_init = latents.clone() - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 8. Rejig the UNet so that we can obtain the cross-attenion maps and - # use them for guiding the subsequent image generation. - self.unet = prepare_unet(self.unet) - - # 7. Denoising loop where we obtain the cross-attention maps. 
- num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs={"timestep": t}, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 8. Compute the edit directions. - edit_direction = self.construct_direction(source_embeds, target_embeds) - - # 9. Edit the prompt embeddings as per the edit directions discovered. - prompt_embeds_edit = prompt_embeds.clone() - prompt_embeds_edit[1:2] += edit_direction - - # 10. Second denoising loop to generate the edited image. - latents = latents_init - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # we want to learn the latent such that it steers the generation - # process towards the edited direction, so make the make initial - # noise learnable - x_in = latent_model_input.detach().clone() - x_in.stop_gradient = False - - # optimizer - opt = paddle.optimizer.SGD(parameters=[x_in], learning_rate=cross_attention_guidance_amount) - - with paddle.set_grad_enabled(True): - # initialize loss - loss = Pix2PixZeroL2Loss() - - # predict the noise residual - noise_pred = self.unet( - x_in, - t, - encoder_hidden_states=prompt_embeds_edit.detach(), - cross_attention_kwargs={"timestep": t, "loss": loss}, - ).sample - - loss.loss.backward(retain_graph=False) - opt.step() - - # recompute the noise - noise_pred = self.unet( - x_in.detach(), - t, - encoder_hidden_states=prompt_embeds_edit, - cross_attention_kwargs={"timestep": None}, - ).sample - - latents = x_in.detach().chunk(2)[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - - # 11. Post-process the latents. - edited_image = self.decode_latents(latents) - - # 12. Run the safety checker. 
- edited_image, has_nsfw_concept = self.run_safety_checker(edited_image, prompt_embeds.dtype) - - # 13. Convert to PIL. - if output_type == "pil": - edited_image = self.numpy_to_pil(edited_image) - - if not return_dict: - return (edited_image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=edited_image, nsfw_content_detected=has_nsfw_concept) - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_INVERT_DOC_STRING) - def invert( - self, - prompt: Optional[str] = None, - image: Union[paddle.Tensor, PIL.Image.Image] = None, - num_inference_steps: int = 50, - guidance_scale: float = 1, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - cross_attention_guidance_amount: float = 0.1, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - lambda_auto_corr: float = 20.0, - lambda_kl: float = 20.0, - num_reg_steps: int = 5, - num_auto_corr_rolls: int = 5, - ): - r""" - Function used to generate inverted latents given a prompt and image. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`PIL.Image.Image`, *optional*): - `Image`, or tensor representing an image batch which will be used for conditioning. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 1): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - cross_attention_guidance_amount (`float`, defaults to 0.1): - Amount of guidance needed from the reference cross-attention maps. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. 
The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - lambda_auto_corr (`float`, *optional*, defaults to 20.0): - Lambda parameter to control auto correction - lambda_kl (`float`, *optional*, defaults to 20.0): - Lambda parameter to control Kullback–Leibler divergence output - num_reg_steps (`int`, *optional*, defaults to 5): - Number of regularization loss steps - num_auto_corr_rolls (`int`, *optional*, defaults to 5): - Number of auto correction roll steps - - Examples: - - Returns: - [`~pipelines.stable_diffusion.pipeline_stable_diffusion_pix2pix_zero.Pix2PixInversionPipelineOutput`] or - `tuple`: - [`~pipelines.stable_diffusion.pipeline_stable_diffusion_pix2pix_zero.Pix2PixInversionPipelineOutput`] if - `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is the inverted - latents tensor and then second is the corresponding decoded image. - """ - # 1. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - if cross_attention_kwargs is None: - cross_attention_kwargs = {} - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Preprocess image - image = preprocess(image) - - # 4. Prepare latent variables - latents = self.prepare_image_latents(image, batch_size, self.vae.dtype, generator) - - # 5. Encode input prompt - num_images_per_prompt = 1 - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - prompt_embeds=prompt_embeds, - ) - - # 4. Prepare timesteps - self.inverse_scheduler.set_timesteps(num_inference_steps) - timesteps = self.inverse_scheduler.timesteps - - # 6. Rejig the UNet so that we can obtain the cross-attenion maps and - # use them for guiding the subsequent image generation. - self.unet = prepare_unet(self.unet) - - # 7. Denoising loop where we obtain the cross-attention maps. 
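Taken together, the `__call__` and `invert` entry points above form an invert-then-edit workflow: embed a few sentences for the source and target concepts, invert the input image into latents, then regenerate while steering along the source-to-target direction. The sketch below is illustrative only and is not taken from this patch: the checkpoint id, image path, and concept sentences are placeholders, and it assumes the pipeline is exposed as `StableDiffusionPix2PixZeroPipeline` with `DDIMScheduler`/`DDIMInverseScheduler`, as in the upstream diffusers port.

```py
import paddle
from PIL import Image

from ppdiffusers import DDIMInverseScheduler, DDIMScheduler, StableDiffusionPix2PixZeroPipeline

# NOTE: checkpoint id, image path and concept sentences are placeholders.
pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", paddle_dtype=paddle.float16
)
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)

# Embed sentences describing the source and target concepts; their mean
# difference becomes the edit direction (see construct_direction / get_embeds).
source_embeds = pipeline.get_embeds(["a photo of a cat", "a picture of a cat", "a cat"])
target_embeds = pipeline.get_embeds(["a photo of a dog", "a picture of a dog", "a dog"])

# Invert the input image into latents that reproduce it under the caption.
caption = "a photo of a cat sitting on the grass"
raw_image = Image.open("cat.png").convert("RGB").resize((512, 512))  # placeholder path
inv_latents = pipeline.invert(caption, image=raw_image).latents

# Regenerate from the inverted latents while steering along cat -> dog.
edited = pipeline(
    caption,
    source_embeds=source_embeds,
    target_embeds=target_embeds,
    latents=inv_latents,
    num_inference_steps=50,
    cross_attention_guidance_amount=0.15,
).images[0]
edited.save("cat_to_dog.png")
```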
- num_warmup_steps = len(timesteps) - num_inference_steps * self.inverse_scheduler.order - with self.progress_bar(total=num_inference_steps - 1) as progress_bar: - for i, t in enumerate(timesteps[:-1]): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.inverse_scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs={"timestep": t}, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # regularization of the noise prediction - with paddle.set_grad_enabled(True): - for _ in range(num_reg_steps): - if lambda_auto_corr > 0: - for _ in range(num_auto_corr_rolls): - var = noise_pred.detach().clone() - var.stop_gradient = False - - # Derive epsilon from model output before regularizing to IID standard normal - var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t) - - l_ac = self.auto_corr_loss(var_epsilon, generator=generator) - l_ac.backward() - - grad = var.grad.detach() / num_auto_corr_rolls - noise_pred = noise_pred - lambda_auto_corr * grad - - if lambda_kl > 0: - var = noise_pred.detach().clone() - var.stop_gradient = False - - # Derive epsilon from model output before regularizing to IID standard normal - var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t) - - l_kld = self.kl_divergence(var_epsilon) - l_kld.backward() - - grad = var.grad.detach() - noise_pred = noise_pred - lambda_kl * grad - - noise_pred = noise_pred.detach() - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.inverse_scheduler.step(noise_pred, t, latents).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and (i + 1) % self.inverse_scheduler.order == 0 - ): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - inverted_latents = latents.detach().clone() - - # 8. Post-processing - image = self.decode_latents(latents.detach()) - - # 9. Convert to PIL. - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (inverted_latents, image) - - return Pix2PixInversionPipelineOutput(latents=inverted_latents, images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py deleted file mode 100644 index 78b8915013cb..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +++ /dev/null @@ -1,717 +0,0 @@ -# Copyright 2022 Susung Hong and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Any, Callable, Dict, List, Optional, Union - -import paddle -import paddle.nn.functional as F - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer - -from ...loaders import TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import logging, randn_tensor, replace_example_docstring -from ..pipeline_utils import DiffusionPipeline -from . import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import paddle - >>> from ppdiffusers import StableDiffusionSAGPipeline - - >>> pipe = StableDiffusionSAGPipeline.from_pretrained( - ... "runwayml/stable-diffusion-v1-5", paddle_dtype=paddle.float16 - ... ) - - >>> prompt = "a photo of an astronaut riding a horse on mars" - >>> image = pipe(prompt, sag_scale=0.75).images[0] - ``` -""" - - -# processes and stores attention probabilities -class CrossAttnStoreProcessor: - def __init__(self): - self.attention_probs = None - - def __call__( - self, - attn, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - ): - batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - query = attn.to_q(hidden_states) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - query = attn.head_to_batch_dim(query) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - - attention_probs = attn.get_attention_scores(query, key, attention_mask) - # we need to flatten this (0, 1) - self.attention_probs = attention_probs.flatten(0, 1) - hidden_states = paddle.matmul(attention_probs, value) - hidden_states = attn.batch_to_head_dim(hidden_states) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - return hidden_states - - -# Modified to get self-attention guidance scale in this paper (https://arxiv.org/pdf/2210.00939.pdf) as an input -class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin): - r""" - Pipeline for text-to-image generation using Stable Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). 
- unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = True, - ): - super().__init__() - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. 
- """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker - def run_safety_checker(self, image, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - else: - has_nsfw_concept = None - return image, has_nsfw_concept - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - sag_scale: float = 0.75, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - sag_scale (`float`, *optional*, defaults to 0.75): - SAG scale as defined in [Improving Sample Quality of Diffusion Models Using Self-Attention Guidance] - (https://arxiv.org/abs/2210.00939). `sag_scale` is defined as `s_s` of equation (24) of SAG paper: - https://arxiv.org/pdf/2210.00939.pdf. Typically chosen between [0, 1.0] for better quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. 
- latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/ppdiffusers/models/cross_attention.py). - - Examples: - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - # and `sag_scale` is` `s` of equation (16) - # of the self-attentnion guidance paper: https://arxiv.org/pdf/2210.00939.pdf - # `sag_scale = 0` means no self-attention guidance - do_self_attention_guidance = sag_scale > 0.0 - - # 3. 
Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Denoising loop - store_processor = CrossAttnStoreProcessor() - self.unet.mid_block.attentions[0].transformer_blocks[0].attn1.processor = store_processor - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - - map_size = None - - def get_map_size(module, input, output): - nonlocal map_size - map_size = output.sample.shape[-2:] - - forward_hook = self.unet.mid_block.attentions[0].register_forward_post_hook(get_map_size) - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # perform self-attention guidance with the stored self-attentnion map - if do_self_attention_guidance: - # classifier-free guidance produces two chunks of attention map - # and we only use unconditional one according to equation (25) - # in https://arxiv.org/pdf/2210.00939.pdf - if do_classifier_free_guidance: - # DDIM-like prediction of x0 - pred_x0 = self.pred_x0(latents, noise_pred_uncond, t) - # get the stored attention maps - uncond_attn, cond_attn = store_processor.attention_probs.chunk(2) - # self-attention-based degrading of latents - degraded_latents = self.sag_masking( - pred_x0, uncond_attn, map_size, t, self.pred_epsilon(latents, noise_pred_uncond, t) - ) - uncond_emb, _ = prompt_embeds.chunk(2) - # forward and give guidance - degraded_pred = self.unet(degraded_latents, t, encoder_hidden_states=uncond_emb).sample - noise_pred += sag_scale * (noise_pred_uncond - degraded_pred) - else: - # DDIM-like prediction of x0 - pred_x0 = self.pred_x0(latents, noise_pred, t) - # get the stored attention maps - cond_attn = store_processor.attention_probs - # self-attention-based degrading of latents - degraded_latents = self.sag_masking( - pred_x0, cond_attn, map_size, t, self.pred_epsilon(latents, noise_pred, t) - ) - # forward and give guidance - degraded_pred = self.unet(degraded_latents, t, encoder_hidden_states=prompt_embeds).sample - noise_pred += sag_scale * (noise_pred - degraded_pred) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if 
provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - forward_hook.remove() - - # 8. Post-processing - image = self.decode_latents(latents) - - # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - - # 10. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - def sag_masking(self, original_latents, attn_map, map_size, t, eps): - # Same masking process as in SAG paper: https://arxiv.org/pdf/2210.00939.pdf - bh, hw1, hw2 = attn_map.shape - b, latent_channel, latent_h, latent_w = original_latents.shape - h = self.unet.config.attention_head_dim - if isinstance(h, list): - h = h[-1] - - # Produce attention mask - attn_map = attn_map.reshape([b, h, hw1, hw2]) - attn_mask = attn_map.mean(1, keepdim=False).sum(1, keepdim=False) > 1.0 - - attn_mask = ( - attn_mask.reshape([b, map_size[0], map_size[1]]) - .unsqueeze(1) - .tile([1, latent_channel, 1, 1]) - .cast(attn_map.dtype) - ) - attn_mask = F.interpolate(attn_mask, (latent_h, latent_w)) - - # Blur according to the self-attention mask - degraded_latents = gaussian_blur_2d(original_latents, kernel_size=9, sigma=1.0) - degraded_latents = degraded_latents * attn_mask + original_latents * (1 - attn_mask) - - # Noise it again to match the noise level - degraded_latents = self.scheduler.add_noise(degraded_latents, noise=eps, timesteps=t) - - return degraded_latents - - # Modified from diffusers.schedulers.scheduling_ddim.DDIMScheduler.step - # Note: there are some schedulers that clip or do not return x_0 (PNDMScheduler, DDIMScheduler, etc.) 
- def pred_x0(self, sample, model_output, timestep): - alpha_prod_t = self.scheduler.alphas_cumprod[timestep] - - beta_prod_t = 1 - alpha_prod_t - if self.scheduler.config.prediction_type == "epsilon": - pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) - elif self.scheduler.config.prediction_type == "sample": - pred_original_sample = model_output - elif self.scheduler.config.prediction_type == "v_prediction": - pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output - # predict V - model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample - else: - raise ValueError( - f"prediction_type given as {self.scheduler.config.prediction_type} must be one of `epsilon`, `sample`," - " or `v_prediction`" - ) - - return pred_original_sample - - def pred_epsilon(self, sample, model_output, timestep): - alpha_prod_t = self.scheduler.alphas_cumprod[timestep] - - beta_prod_t = 1 - alpha_prod_t - if self.scheduler.config.prediction_type == "epsilon": - pred_eps = model_output - elif self.scheduler.config.prediction_type == "sample": - pred_eps = (sample - (alpha_prod_t**0.5) * model_output) / (beta_prod_t**0.5) - elif self.scheduler.config.prediction_type == "v_prediction": - pred_eps = (beta_prod_t**0.5) * sample + (alpha_prod_t**0.5) * model_output - else: - raise ValueError( - f"prediction_type given as {self.scheduler.config.prediction_type} must be one of `epsilon`, `sample`," - " or `v_prediction`" - ) - - return pred_eps - - -# Gaussian blur -def gaussian_blur_2d(img, kernel_size, sigma): - ksize_half = (kernel_size - 1) * 0.5 - x = paddle.linspace(-ksize_half, ksize_half, num=kernel_size) - - pdf = paddle.exp(-0.5 * (x / sigma).pow(2)) - - x_kernel = pdf / pdf.sum() - x_kernel = x_kernel.cast(img.dtype) - - kernel2d = paddle.matmul(x_kernel[:, None], x_kernel[None, :]) - kernel2d = kernel2d.expand([img.shape[-3], 1, kernel2d.shape[0], kernel2d.shape[1]]) - - padding = [kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2] - - img = F.pad(img, padding, mode="reflect") - img = F.conv2d(img, kernel2d, groups=img.shape[-3]) - - return img diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py deleted file mode 100644 index 691e4591d974..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ /dev/null @@ -1,549 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
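As an aside on the `pred_x0` / `pred_epsilon` helpers of the SAG pipeline above: the three scheduler parameterizations (`epsilon`, `sample`, `v_prediction`) are interchangeable because `x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps`. The following self-contained NumPy check (illustrative only, not part of either pipeline) makes the conversions used in those helpers explicit.

```py
import numpy as np

rng = np.random.default_rng(0)
alpha_bar = 0.7  # an arbitrary cumulative alpha for one timestep
x0 = rng.normal(size=(4,))
eps = rng.normal(size=(4,))
x_t = alpha_bar**0.5 * x0 + (1 - alpha_bar) ** 0.5 * eps

# "epsilon" parameterization: the model predicts eps, so x0 is recovered as in pred_x0.
x0_from_eps = (x_t - (1 - alpha_bar) ** 0.5 * eps) / alpha_bar**0.5

# "v_prediction" parameterization: v = sqrt(alpha_bar) * eps - sqrt(1 - alpha_bar) * x0,
# matching the v branches of pred_x0 and pred_epsilon above.
v = alpha_bar**0.5 * eps - (1 - alpha_bar) ** 0.5 * x0
x0_from_v = alpha_bar**0.5 * x_t - (1 - alpha_bar) ** 0.5 * v
eps_from_v = (1 - alpha_bar) ** 0.5 * x_t + alpha_bar**0.5 * v

assert np.allclose(x0_from_eps, x0)
assert np.allclose(x0_from_v, x0)
assert np.allclose(eps_from_v, eps)
```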
- -import inspect -from typing import Callable, List, Optional, Union - -import numpy as np -import paddle -import PIL - -from paddlenlp.transformers import CLIPTextModel, CLIPTokenizer - -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers -from ...utils import deprecate, logging, randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -def preprocess(image): - if isinstance(image, paddle.Tensor): - return image - elif isinstance(image, PIL.Image.Image): - image = [image] - - if isinstance(image[0], PIL.Image.Image): - w, h = image[0].size - w, h = map(lambda x: x - x % 64, (w, h)) # resize to integer multiple of 64 - - image = [np.array(i.resize((w, h)))[None, :] for i in image] - image = np.concatenate(image, axis=0) - image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = 2.0 * image - 1.0 - image = paddle.to_tensor(image) - elif isinstance(image[0], paddle.Tensor): - image = paddle.concat(image, axis=0) - return image - - -class StableDiffusionUpscalePipeline(DiffusionPipeline): - r""" - Pipeline for text-guided image super-resolution using Stable Diffusion 2. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - low_res_scheduler ([`SchedulerMixin`]): - A scheduler used to add initial noise to the low res conditioning image. It must be an instance of - [`DDPMScheduler`]. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - """ - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - low_res_scheduler: DDPMScheduler, - scheduler: KarrasDiffusionSchedulers, - max_noise_level: int = 350, - ): - super().__init__() - - # check if vae has a config attribute `scaling_factor` and if it is set to 0.08333, else set it to 0.08333 and deprecate - is_vae_scaling_factor_set_to_0_08333 = ( - hasattr(vae.config, "scaling_factor") and vae.config.scaling_factor == 0.08333 - ) - if not is_vae_scaling_factor_set_to_0_08333: - deprecation_message = ( - "The configuration file of the vae does not contain `scaling_factor` or it is set to" - f" {vae.config.scaling_factor}, which seems highly unlikely. 
If your checkpoint is a fine-tuned" - " version of `stabilityai/stable-diffusion-x4-upscaler` you should change 'scaling_factor' to 0.08333" - " Please make sure to update the config accordingly, as not doing so might lead to incorrect results" - " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be" - " very nice if you could open a Pull Request for the `vae/config.json` file" - ) - deprecate("wrong scaling_factor", "1.0.0", deprecation_message, standard_warn=False) - vae.register_to_config(scaling_factor=0.08333) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - low_res_scheduler=low_res_scheduler, - scheduler=scheduler, - ) - self.register_to_config(max_noise_level=max_noise_level) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. - Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. 
- """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. 
- # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - def check_inputs(self, prompt, image, noise_level, callback_steps): - if not isinstance(prompt, str) and not isinstance(prompt, list): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if ( - not isinstance(image, paddle.Tensor) - and not isinstance(image, PIL.Image.Image) - and not isinstance(image, list) - ): - raise ValueError( - f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}" - ) - - # verify batch size of prompt and image are same if image is a list or tensor - if isinstance(image, list) or isinstance(image, paddle.Tensor): - if isinstance(prompt, str): - batch_size = 1 - else: - batch_size = len(prompt) - if isinstance(image, list): - image_batch_size = len(image) - else: - image_batch_size = image.shape[0] - if batch_size != image_batch_size: - raise ValueError( - f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}." - " Please make sure that passed `prompt` matches the batch size of `image`." - ) - - # check noise level - if noise_level > self.config.max_noise_level: - raise ValueError(f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." 
- ) - - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height, width) - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - else: - if latents.shape != list(shape): - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - latents = latents - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @paddle.no_grad() - def __call__( - self, - prompt: Union[str, List[str]] = None, - image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]] = None, - num_inference_steps: int = 75, - guidance_scale: float = 9.0, - noise_level: int = 20, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `paddle.Tensor`): - `Image`, or tensor representing an image batch which will be upscaled. * - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Examples: - ```py - >>> import requests - >>> from PIL import Image - >>> from io import BytesIO - >>> from ppdiffusers import StableDiffusionUpscalePipeline - >>> import torch - - >>> # load model and scheduler - >>> model_id = "stabilityai/stable-diffusion-x4-upscaler" - >>> pipeline = StableDiffusionUpscalePipeline.from_pretrained( - ... model_id, revision="fp16", paddle_dtype=paddle.float16 - ... ) - - >>> # let's download an image - >>> url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale/low_res_cat.png" - >>> response = requests.get(url) - >>> low_res_img = Image.open(BytesIO(response.content)).convert("RGB") - >>> low_res_img = low_res_img.resize((128, 128)) - >>> prompt = "a white cat" - - >>> upscaled_image = pipeline(prompt=prompt, image=low_res_img).images[0] - >>> upscaled_image.save("upsampled_cat.png") - ``` - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 1. Check inputs - self.check_inputs(prompt, image, noise_level, callback_steps) - - if image is None: - raise ValueError("`image` input cannot be undefined.") - - # 2. Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Preprocess image - image = preprocess(image) - image = image.cast(prompt_embeds.dtype) - - # 5. 
set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Add noise to image - noise_level = paddle.to_tensor([noise_level], dtype="int64") - noise = randn_tensor(image.shape, generator=generator, dtype=prompt_embeds.dtype) - image = self.low_res_scheduler.add_noise(image, noise, noise_level) - - batch_multiplier = 2 if do_classifier_free_guidance else 1 - image = paddle.concat([image] * batch_multiplier * num_images_per_prompt) - noise_level = paddle.concat([noise_level] * image.shape[0]) - - # 6. Prepare latent variables - height, width = image.shape[2:] - num_channels_latents = self.vae.config.latent_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # 7. Check that sizes of image and latents match - num_channels_image = image.shape[1] - if num_channels_latents + num_channels_image != self.unet.config.in_channels: - raise ValueError( - f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" - f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" - f" `num_channels_image`: {num_channels_image} " - f" = {num_channels_latents+num_channels_image}. Please verify the config of" - " `pipeline.unet` or your `image` input." - ) - - # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 9. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - - # concat latents, mask, masked_image_latents in the channel dimension - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - latent_model_input = paddle.concat([latent_model_input, image.cast(latent_model_input.dtype)], axis=1) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, encoder_hidden_states=prompt_embeds, class_labels=noise_level - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - # 10. Post-processing - # make sure the VAE is in float32 mode, as it overflows in float16 - if self.vae.dtype != paddle.float32: - self.vae.to(dtype=paddle.float32) - image = self.decode_latents(latents.cast("float32")) - - # 11. 
Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,) - - return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py deleted file mode 100644 index fa289a9c3f8b..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ /dev/null @@ -1,838 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import paddle - -from paddlenlp.transformers import ( - CLIPTextModel, - CLIPTextModelWithProjection, - CLIPTokenizer, -) -from paddlenlp.transformers.clip.modeling import CLIPTextModelOutput - -from ...loaders import TextualInversionLoaderMixin -from ...models import AutoencoderKL, PriorTransformer, UNet2DConditionModel -from ...models.embeddings import get_timestep_embedding -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import logging, randn_tensor, replace_example_docstring - -# from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput -from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import paddle - >>> from ppdiffusers import StableUnCLIPPipeline - - >>> pipe = StableUnCLIPPipeline.from_pretrained( - ... "fusing/stable-unclip-2-1-l", paddle_dtype=paddle.float16 - ... ) # TODO update model path - - >>> prompt = "a photo of an astronaut riding a horse on mars" - >>> images = pipe(prompt).images - >>> images[0].save("astronaut_horse.png") - ``` -""" - - -class StableUnCLIPPipeline(DiffusionPipeline, TextualInversionLoaderMixin): - """ - Pipeline for text-to-image generation using stable unCLIP. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - prior_tokenizer ([`CLIPTokenizer`]): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - prior_text_encoder ([`CLIPTextModelWithProjection`]): - Frozen text-encoder. - prior ([`PriorTransformer`]): - The canonincal unCLIP prior to approximate the image embedding from the text embedding. - prior_scheduler ([`KarrasDiffusionSchedulers`]): - Scheduler used in the prior denoising process. 
- image_normalizer ([`StableUnCLIPImageNormalizer`]): - Used to normalize the predicted image embeddings before the noise is applied and un-normalize the image - embeddings after the noise has been applied. - image_noising_scheduler ([`KarrasDiffusionSchedulers`]): - Noise schedule for adding noise to the predicted image embeddings. The amount of noise to add is determined - by `noise_level` in `StableUnCLIPPipeline.__call__`. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`KarrasDiffusionSchedulers`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - """ - - # prior components - prior_tokenizer: CLIPTokenizer - prior_text_encoder: CLIPTextModelWithProjection - prior: PriorTransformer - prior_scheduler: KarrasDiffusionSchedulers - - # image noising components - image_normalizer: StableUnCLIPImageNormalizer - image_noising_scheduler: KarrasDiffusionSchedulers - - # regular denoising components - tokenizer: CLIPTokenizer - text_encoder: CLIPTextModel - unet: UNet2DConditionModel - scheduler: KarrasDiffusionSchedulers - - vae: AutoencoderKL - - def __init__( - self, - # prior components - prior_tokenizer: CLIPTokenizer, - prior_text_encoder: CLIPTextModelWithProjection, - prior: PriorTransformer, - prior_scheduler: KarrasDiffusionSchedulers, - # image noising components - image_normalizer: StableUnCLIPImageNormalizer, - image_noising_scheduler: KarrasDiffusionSchedulers, - # regular denoising components - tokenizer: CLIPTokenizer, - text_encoder: CLIPTextModelWithProjection, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - # vae - vae: AutoencoderKL, - ): - super().__init__() - - self.register_modules( - prior_tokenizer=prior_tokenizer, - prior_text_encoder=prior_text_encoder, - prior=prior, - prior_scheduler=prior_scheduler, - image_normalizer=image_normalizer, - image_noising_scheduler=image_noising_scheduler, - tokenizer=tokenizer, - text_encoder=text_encoder, - unet=unet, - scheduler=scheduler, - vae=vae, - ) - - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - - # Copied from ppdiffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline._encode_prompt with _encode_prompt->_encode_prior_prompt, tokenizer->prior_tokenizer, text_encoder->prior_text_encoder - def _encode_prior_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None, - text_attention_mask: Optional[paddle.Tensor] = None, - ): - if text_model_output is None: - batch_size = len(prompt) if isinstance(prompt, list) else 1 - # get prompt text embeddings - text_inputs = self.prior_tokenizer( - prompt, - padding="max_length", - max_length=self.prior_tokenizer.model_max_length, - return_attention_mask=True, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - text_mask = text_inputs.attention_mask - - untruncated_ids = self.prior_tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - 
text_input_ids, untruncated_ids - ): - removed_text = self.prior_tokenizer.batch_decode( - untruncated_ids[:, self.prior_tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.prior_tokenizer.model_max_length} tokens: {removed_text}" - ) - text_input_ids = text_input_ids[:, : self.prior_tokenizer.model_max_length] - - prior_text_encoder_output = self.prior_text_encoder(text_input_ids) - - prompt_embeds = prior_text_encoder_output.text_embeds - prior_text_encoder_hidden_states = prior_text_encoder_output.last_hidden_state - - else: - batch_size = text_model_output[0].shape[0] - prompt_embeds, prior_text_encoder_hidden_states = text_model_output[0], text_model_output[1] - text_mask = text_attention_mask - - prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, axis=0) - prior_text_encoder_hidden_states = prior_text_encoder_hidden_states.repeat_interleave( - num_images_per_prompt, axis=0 - ) - - text_mask = text_mask.repeat_interleave(num_images_per_prompt, axis=0) - - if do_classifier_free_guidance: - uncond_tokens = [""] * batch_size - - uncond_input = self.prior_tokenizer( - uncond_tokens, - padding="max_length", - max_length=self.prior_tokenizer.model_max_length, - return_attention_mask=True, - truncation=True, - return_tensors="pd", - ) - uncond_text_mask = uncond_input.attention_mask - negative_prompt_embeds_prior_text_encoder_output = self.prior_text_encoder(uncond_input.input_ids) - - negative_prompt_embeds = negative_prompt_embeds_prior_text_encoder_output.text_embeds - uncond_prior_text_encoder_hidden_states = ( - negative_prompt_embeds_prior_text_encoder_output.last_hidden_state - ) - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - - seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len]) - - seq_len = uncond_prior_text_encoder_hidden_states.shape[1] - uncond_prior_text_encoder_hidden_states = uncond_prior_text_encoder_hidden_states.tile( - [1, num_images_per_prompt, 1] - ) - uncond_prior_text_encoder_hidden_states = uncond_prior_text_encoder_hidden_states.reshape( - [batch_size * num_images_per_prompt, seq_len, -1] - ) - uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, axis=0) - - # done duplicates - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - prior_text_encoder_hidden_states = paddle.concat( - [uncond_prior_text_encoder_hidden_states, prior_text_encoder_hidden_states] - ) - - text_mask = paddle.concat([uncond_text_mask, text_mask]) - - return prompt_embeds, prior_text_encoder_hidden_states, text_mask - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. 
- - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. 
Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs with prepare_extra_step_kwargs->prepare_prior_extra_step_kwargs, scheduler->prior_scheduler - def prepare_prior_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the prior_scheduler step, since not all prior_schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other prior_schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.prior_scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the prior_scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.prior_scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - noise_level, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Please make sure to define only one of the two." - ) - - if prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - - if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - "Provide either `negative_prompt` or `negative_prompt_embeds`. Cannot leave both `negative_prompt` and `negative_prompt_embeds` undefined." - ) - - if prompt is not None and negative_prompt is not None: - if type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps: - raise ValueError( - f"`noise_level` must be between 0 and {self.image_noising_scheduler.config.num_train_timesteps - 1}, inclusive." - ) - - # Copied from ppdiffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents - def prepare_latents(self, shape, dtype, generator, latents, scheduler): - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - else: - if latents.shape != list(shape): - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - latents = latents - - latents = latents * scheduler.init_noise_sigma - return latents - - def noise_image_embeddings( - self, - image_embeds: paddle.Tensor, - noise_level: int, - noise: Optional[paddle.Tensor] = None, - generator: Optional[paddle.Generator] = None, - ): - """ - Add noise to the image embeddings. The amount of noise is controlled by a `noise_level` input. 
A higher - `noise_level` increases the variance in the final un-noised images. - - The noise is applied in two ways - 1. A noise schedule is applied directly to the embeddings - 2. A vector of sinusoidal time embeddings are appended to the output. - - In both cases, the amount of noise is controlled by the same `noise_level`. - - The embeddings are normalized before the noise is applied and un-normalized after the noise is applied. - """ - if noise is None: - noise = randn_tensor(image_embeds.shape, generator=generator, dtype=image_embeds.dtype) - - noise_level = paddle.to_tensor([noise_level] * image_embeds.shape[0]) - - image_embeds = self.image_normalizer.scale(image_embeds) - - image_embeds = self.image_noising_scheduler.add_noise(image_embeds, timesteps=noise_level, noise=noise) - - image_embeds = self.image_normalizer.unscale(image_embeds) - - noise_level = get_timestep_embedding( - timesteps=noise_level, embedding_dim=image_embeds.shape[-1], flip_sin_to_cos=True, downscale_freq_shift=0 - ) - - # `get_timestep_embeddings` does not contain any weights and will always return f32 tensors, - # but we might actually be running in fp16. so we need to cast here. - # there might be better ways to encapsulate this. - noise_level = noise_level.cast(image_embeds.dtype) - - image_embeds = paddle.concat((image_embeds, noise_level), axis=1) - - return image_embeds - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - # regular denoising process args - prompt: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 20, - guidance_scale: float = 10.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[paddle.Generator] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - noise_level: int = 0, - # prior args - prior_num_inference_steps: int = 25, - prior_guidance_scale: float = 4.0, - prior_latents: Optional[paddle.Tensor] = None, - ): - """ - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 20): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 10.0): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/ppdiffusers/models/cross_attention.py). - noise_level (`int`, *optional*, defaults to `0`): - The amount of noise to add to the image embeddings. A higher `noise_level` increases the variance in - the final un-noised images. See `StableUnCLIPPipeline.noise_image_embeddings` for details. - prior_num_inference_steps (`int`, *optional*, defaults to 25): - The number of denoising steps in the prior denoising process. More denoising steps usually lead to a - higher quality image at the expense of slower inference. - prior_guidance_scale (`float`, *optional*, defaults to 4.0): - Guidance scale for the prior denoising process as defined in [Classifier-Free Diffusion - Guidance](https://arxiv.org/abs/2207.12598). `prior_guidance_scale` is defined as `w` of equation 2. 
of - [Imagen Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting - `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to - the text `prompt`, usually at the expense of lower image quality. - prior_latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - embedding generation in the prior denoising process. Can be used to tweak the same generation with - different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied - random `generator`. - - Examples: - - Returns: - [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~ pipeline_utils.ImagePipelineOutput`] if `return_dict` is - True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images. - """ - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt=prompt, - height=height, - width=width, - callback_steps=callback_steps, - noise_level=noise_level, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - batch_size = batch_size * num_images_per_prompt - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - prior_do_classifier_free_guidance = prior_guidance_scale > 1.0 - - # 3. Encode input prompt - prior_prompt_embeds, prior_text_encoder_hidden_states, prior_text_mask = self._encode_prior_prompt( - prompt=prompt, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=prior_do_classifier_free_guidance, - ) - - # 4. Prepare prior timesteps - self.prior_scheduler.set_timesteps(prior_num_inference_steps) - prior_timesteps_tensor = self.prior_scheduler.timesteps - - # 5. Prepare prior latent variables - embedding_dim = self.prior.config.embedding_dim - prior_latents = self.prepare_latents( - (batch_size, embedding_dim), - prior_prompt_embeds.dtype, - generator, - prior_latents, - self.prior_scheduler, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - prior_extra_step_kwargs = self.prepare_prior_extra_step_kwargs(generator, eta) - - # 7. 
Prior denoising loop - for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): - # expand the latents if we are doing classifier free guidance - latent_model_input = ( - paddle.concat([prior_latents] * 2) if prior_do_classifier_free_guidance else prior_latents - ) - latent_model_input = self.prior_scheduler.scale_model_input(latent_model_input, t) - - predicted_image_embedding = self.prior( - latent_model_input, - timestep=t, - proj_embedding=prior_prompt_embeds, - encoder_hidden_states=prior_text_encoder_hidden_states, - attention_mask=prior_text_mask, - ).predicted_image_embedding - - if prior_do_classifier_free_guidance: - predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2) - predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * ( - predicted_image_embedding_text - predicted_image_embedding_uncond - ) - - prior_latents = self.prior_scheduler.step( - predicted_image_embedding, - timestep=t, - sample=prior_latents, - **prior_extra_step_kwargs, - ).prev_sample - - if callback is not None and i % callback_steps == 0: - callback(i, t, prior_latents) - - prior_latents = self.prior.post_process_latents(prior_latents) - - image_embeds = prior_latents - - # done prior - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 8. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt=prompt, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 9. Prepare image embeddings - image_embeds = self.noise_image_embeddings( - image_embeds=image_embeds, - noise_level=noise_level, - generator=generator, - ) - - if do_classifier_free_guidance: - negative_prompt_embeds = paddle.zeros_like(image_embeds) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - image_embeds = paddle.concat([negative_prompt_embeds, image_embeds]) - - # 10. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 11. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - latents = self.prepare_latents( - shape=shape, - dtype=prompt_embeds.dtype, - generator=generator, - latents=latents, - scheduler=self.scheduler, - ) - - # 12. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 13. 
Denoising loop - for i, t in enumerate(self.progress_bar(timesteps)): - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - class_labels=image_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 14. Post-processing - image = self.decode_latents(latents) - - # 15. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,) - - return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py deleted file mode 100644 index 4fb7dcc34853..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ /dev/null @@ -1,725 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Any, Callable, Dict, List, Optional, Union - -import paddle -import PIL - -from paddlenlp.transformers import ( - CLIPImageProcessor, - CLIPTextModel, - CLIPTokenizer, - CLIPVisionModelWithProjection, -) - -from ...loaders import TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet2DConditionModel -from ...models.embeddings import get_timestep_embedding -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import logging, randn_tensor, replace_example_docstring -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput -from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import requests - >>> import paddle - >>> from PIL import Image - >>> from io import BytesIO - - >>> from ppdiffusers import StableUnCLIPImg2ImgPipeline - - >>> pipe = StableUnCLIPImg2ImgPipeline.from_pretrained( - ... "fusing/stable-unclip-2-1-l-img2img", paddle_dtype=paddle.float16 - ... 
) # TODO update model path - - >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" - - >>> response = requests.get(url) - >>> init_image = Image.open(BytesIO(response.content)).convert("RGB") - >>> init_image = init_image.resize((768, 512)) - - >>> prompt = "A fantasy landscape, trending on artstation" - - >>> images = pipe(prompt, init_image).images - >>> images[0].save("fantasy_landscape.png") - ``` -""" - - -class StableUnCLIPImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin): - """ - Pipeline for text-guided image to image generation using stable unCLIP. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - feature_extractor ([`CLIPImageProcessor`]): - Feature extractor for image pre-processing before being encoded. - image_encoder ([`CLIPVisionModelWithProjection`]): - CLIP vision model for encoding images. - image_normalizer ([`StableUnCLIPImageNormalizer`]): - Used to normalize the predicted image embeddings before the noise is applied and un-normalize the image - embeddings after the noise has been applied. - image_noising_scheduler ([`KarrasDiffusionSchedulers`]): - Noise schedule for adding noise to the predicted image embeddings. The amount of noise to add is determined - by `noise_level` in `StableUnCLIPPipeline.__call__`. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`KarrasDiffusionSchedulers`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. 
- """ - - # image encoding components - feature_extractor: CLIPImageProcessor - image_encoder: CLIPVisionModelWithProjection - - # image noising components - image_normalizer: StableUnCLIPImageNormalizer - image_noising_scheduler: KarrasDiffusionSchedulers - - # regular denoising components - tokenizer: CLIPTokenizer - text_encoder: CLIPTextModel - unet: UNet2DConditionModel - scheduler: KarrasDiffusionSchedulers - - vae: AutoencoderKL - - def __init__( - self, - # image encoding components - feature_extractor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection, - # image noising components - image_normalizer: StableUnCLIPImageNormalizer, - image_noising_scheduler: KarrasDiffusionSchedulers, - # regular denoising components - tokenizer: CLIPTokenizer, - text_encoder: CLIPTextModel, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - # vae - vae: AutoencoderKL, - ): - super().__init__() - - self.register_modules( - feature_extractor=feature_extractor, - image_encoder=image_encoder, - image_normalizer=image_normalizer, - image_noising_scheduler=image_noising_scheduler, - tokenizer=tokenizer, - text_encoder=text_encoder, - unet=unet, - scheduler=scheduler, - vae=vae, - ) - - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. 
- """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - def _encode_image( - self, - image, - batch_size, - num_images_per_prompt, - do_classifier_free_guidance, - noise_level, - generator, - image_embeds, - ): - dtype = self.image_encoder.dtype - - if isinstance(image, PIL.Image.Image): - # the image embedding should repeated so it matches the total batch size of the prompt - repeat_by = batch_size - else: - # assume the image input is already properly batched and just needs to be repeated so - # it matches the num_images_per_prompt. - # - # NOTE(will) this is probably missing a few number of side cases. I.e. batched/non-batched - # `image_embeds`. If those happen to be common use cases, let's think harder about - # what the expected dimensions of inputs should be and how we handle the encoding. - repeat_by = num_images_per_prompt - - if image_embeds is None: - if not isinstance(image, paddle.Tensor): - image = self.feature_extractor(images=image, return_tensors="pd").pixel_values - - image = image.cast(dtype) - image_embeds = self.image_encoder(image).image_embeds - - image_embeds = self.noise_image_embeddings( - image_embeds=image_embeds, - noise_level=noise_level, - generator=generator, - ) - - # duplicate image embeddings for each generation per prompt, using mps friendly method - image_embeds = image_embeds.unsqueeze(1) - bs_embed, seq_len, _ = image_embeds.shape - image_embeds = image_embeds.tile([1, repeat_by, 1]) - image_embeds = image_embeds.reshape([bs_embed * repeat_by, seq_len, -1]) - image_embeds = image_embeds.squeeze(1) - - if do_classifier_free_guidance: - negative_prompt_embeds = paddle.zeros_like(image_embeds) - - # For classifier free guidance, we need to do two forward passes. 
- # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - image_embeds = paddle.concat([negative_prompt_embeds, image_embeds]) - - return image_embeds - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - image, - height, - width, - callback_steps, - noise_level, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - image_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Please make sure to define only one of the two." - ) - - if prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - - if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - "Provide either `negative_prompt` or `negative_prompt_embeds`. Cannot leave both `negative_prompt` and `negative_prompt_embeds` undefined." - ) - - if prompt is not None and negative_prompt is not None: - if type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." 
- ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps: - raise ValueError( - f"`noise_level` must be between 0 and {self.image_noising_scheduler.config.num_train_timesteps - 1}, inclusive." - ) - - if image is not None and image_embeds is not None: - raise ValueError( - "Provide either `image` or `image_embeds`. Please make sure to define only one of the two." - ) - - if image is None and image_embeds is None: - raise ValueError( - "Provide either `image` or `image_embeds`. Cannot leave both `image` and `image_embeds` undefined." - ) - - if image is not None: - if ( - not isinstance(image, paddle.Tensor) - and not isinstance(image, PIL.Image.Image) - and not isinstance(image, list) - ): - raise ValueError( - "`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" - f" {type(image)}" - ) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_unclip.StableUnCLIPPipeline.noise_image_embeddings - def noise_image_embeddings( - self, - image_embeds: paddle.Tensor, - noise_level: int, - noise: Optional[paddle.Tensor] = None, - generator: Optional[paddle.Generator] = None, - ): - """ - Add noise to the image embeddings. The amount of noise is controlled by a `noise_level` input. A higher - `noise_level` increases the variance in the final un-noised images. - - The noise is applied in two ways - 1. A noise schedule is applied directly to the embeddings - 2. A vector of sinusoidal time embeddings are appended to the output. - - In both cases, the amount of noise is controlled by the same `noise_level`. - - The embeddings are normalized before the noise is applied and un-normalized after the noise is applied. 
- """ - if noise is None: - noise = randn_tensor(image_embeds.shape, generator=generator, dtype=image_embeds.dtype) - noise_level = paddle.to_tensor([noise_level] * image_embeds.shape[0]).reshape([image_embeds.shape[0]]) - - image_embeds = self.image_normalizer.scale(image_embeds) - - image_embeds = self.image_noising_scheduler.add_noise(image_embeds, timesteps=noise_level, noise=noise) - - image_embeds = self.image_normalizer.unscale(image_embeds) - - noise_level = get_timestep_embedding( - timesteps=noise_level, embedding_dim=image_embeds.shape[-1], flip_sin_to_cos=True, downscale_freq_shift=0 - ) - - # `get_timestep_embeddings` does not contain any weights and will always return f32 tensors, - # but we might actually be running in fp16. so we need to cast here. - # there might be better ways to encapsulate this. - noise_level = noise_level.cast(image_embeds.dtype) - image_embeds = paddle.concat((image_embeds, noise_level), 1) - - return image_embeds - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - image: Union[paddle.Tensor, PIL.Image.Image] = None, - prompt: Union[str, List[str]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 20, - guidance_scale: float = 10, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[paddle.Generator] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - noise_level: int = 0, - image_embeds: Optional[paddle.Tensor] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, either `prompt_embeds` will be - used or prompt is initialized to `""`. - image (`paddle.Tensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch. The image will be encoded to its CLIP embedding which - the unet will be conditioned on. Note that the image is _not_ encoded by the vae and then used as the - latents in the denoising process such as in the standard stable diffusion text guided image variation - process. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 20): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 10.0): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. 
- negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/ppdiffusers/models/cross_attention.py). - noise_level (`int`, *optional*, defaults to `0`): - The amount of noise to add to the image embeddings. A higher `noise_level` increases the variance in - the final un-noised images. See `StableUnCLIPPipeline.noise_image_embeddings` for details. - image_embeds (`paddle.Tensor`, *optional*): - Pre-generated CLIP embeddings to condition the unet on. Note that these are not latents to be used in - the denoising process. If you want to provide pre-generated latents, pass them to `__call__` as - `latents`. - - Examples: - - Returns: - [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~ pipeline_utils.ImagePipelineOutput`] if `return_dict` is - True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images. - """ - # 0. 
Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - if prompt is None and prompt_embeds is None: - prompt = len(image) * [""] if isinstance(image, list) else "" - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt=prompt, - image=image, - height=height, - width=width, - callback_steps=callback_steps, - noise_level=noise_level, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - image_embeds=image_embeds, - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - batch_size = batch_size * num_images_per_prompt - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt=prompt, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Encoder input image - noise_level = paddle.to_tensor(noise_level) - image_embeds = self._encode_image( - image=image, - batch_size=batch_size, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - noise_level=noise_level, - generator=generator, - image_embeds=image_embeds, - ) - - # 5. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 6. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size=batch_size, - num_channels_latents=num_channels_latents, - height=height, - width=width, - dtype=prompt_embeds.dtype, - generator=generator, - latents=latents, - ) - - # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 8. Denoising loop - for i, t in enumerate(self.progress_bar(timesteps)): - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - class_labels=image_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 9. Post-processing - image = self.decode_latents(latents) - - # 10. 
Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,) - - return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/safety_checker.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/safety_checker.py deleted file mode 100644 index 45d37f264ed1..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/safety_checker.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import paddle -import paddle.nn.functional as F - -from paddlenlp.transformers import ( - CLIPPretrainedModel, - CLIPVisionConfig, - CLIPVisionModel, -) - -from ...utils import logging - -logger = logging.get_logger(__name__) - - -def cosine_distance(image_embeds, text_embeds): - normalized_image_embeds = F.normalize(image_embeds) - normalized_text_embeds = F.normalize(text_embeds) - return paddle.matmul(normalized_image_embeds, normalized_text_embeds, transpose_y=True) - - -class StableDiffusionSafetyChecker(CLIPPretrainedModel): - config_class = CLIPVisionConfig - - def __init__(self, config: CLIPVisionConfig): - super().__init__(config) - - self.clip = CLIPVisionModel(config) - self.vision_projection = paddle.create_parameter( - (config.hidden_size, config.projection_dim), dtype=paddle.get_default_dtype() - ) - - self.register_buffer("concept_embeds", paddle.ones([17, config.projection_dim])) - self.register_buffer("special_care_embeds", paddle.ones([3, config.projection_dim])) - - self.register_buffer("concept_embeds_weights", paddle.ones([17])) - self.register_buffer("special_care_embeds_weights", paddle.ones([3])) - - @paddle.no_grad() - def forward(self, clip_input, images): - pooled_output = self.clip(clip_input)[1] # pooled_output - image_embeds = paddle.matmul(pooled_output, self.vision_projection) - - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds).astype("float32").numpy() - cos_dist = cosine_distance(image_embeds, self.concept_embeds).astype("float32").numpy() - - result = [] - batch_size = image_embeds.shape[0] - for i in range(batch_size): - result_img = {"special_scores": {}, "special_care": [], "concept_scores": {}, "bad_concepts": []} - - # increase this value to create a stronger `nfsw` filter - # at the cost of increasing the possibility of filtering benign images - adjustment = 0.0 - - for concept_idx in range(len(special_cos_dist[0])): - concept_cos = special_cos_dist[i][concept_idx] - concept_threshold = self.special_care_embeds_weights[concept_idx].item() - result_img["special_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3) - if result_img["special_scores"][concept_idx] > 0: - result_img["special_care"].append({concept_idx, 
result_img["special_scores"][concept_idx]}) - adjustment = 0.01 - - for concept_idx in range(len(cos_dist[0])): - concept_cos = cos_dist[i][concept_idx] - concept_threshold = self.concept_embeds_weights[concept_idx].item() - result_img["concept_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3) - if result_img["concept_scores"][concept_idx] > 0: - result_img["bad_concepts"].append(concept_idx) - - result.append(result_img) - - has_nsfw_concepts = [len(res["bad_concepts"]) > 0 for res in result] - - for idx, has_nsfw_concept in enumerate(has_nsfw_concepts): - if has_nsfw_concept: - if paddle.is_tensor(images) or paddle.is_tensor(images[0]): - images[idx] = paddle.zeros_like(images[idx]) # black image - else: - images[idx] = np.zeros(images[idx].shape) # black image - - if any(has_nsfw_concepts): - logger.warning( - "Potential NSFW content was detected in one or more images. A black image will be returned instead." - " Try again with a different prompt and/or seed." - ) - - return images, has_nsfw_concepts - - def forward_fastdeploy(self, clip_input: paddle.Tensor, images: paddle.Tensor): - pooled_output = self.clip(clip_input)[1] # pooled_output - image_embeds = paddle.matmul(pooled_output, self.vision_projection) - - special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds) - cos_dist = cosine_distance(image_embeds, self.concept_embeds) - - # increase this value to create a stronger `nsfw` filter - # at the cost of increasing the possibility of filtering benign images - adjustment = 0.0 - - special_scores = special_cos_dist - self.special_care_embeds_weights + adjustment - # special_scores = special_scores.round(decimals=3) - special_care = paddle.any(special_scores > 0, axis=1) - special_adjustment = special_care * 0.01 - special_adjustment = special_adjustment.unsqueeze(1).expand([-1, cos_dist.shape[1]]) - - concept_scores = (cos_dist - self.concept_embeds_weights) + special_adjustment - # concept_scores = concept_scores.round(decimals=3) - has_nsfw_concepts = paddle.any(concept_scores > 0, axis=1) - - images[has_nsfw_concepts] = 0.0 # black image - - return images, has_nsfw_concepts diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py deleted file mode 100644 index 6d9af5262ff1..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Optional - -import paddle -import paddle.nn as nn - -from ...configuration_utils import ConfigMixin, register_to_config -from ...models.modeling_utils import ModelMixin - - -class StableUnCLIPImageNormalizer(ModelMixin, ConfigMixin): - """ - This class is used to hold the mean and standard deviation of the CLIP embedder used in stable unCLIP. - - It is used to normalize the image embeddings before the noise is applied and un-normalize the noised image - embeddings. - """ - - @register_to_config - def __init__( - self, - embedding_dim: int = 768, - ): - super().__init__() - - self.mean = self.create_parameter( - (1, embedding_dim), dtype=paddle.get_default_dtype(), default_initializer=nn.initializer.Constant(0.0) - ) - self.std = self.create_parameter( - (1, embedding_dim), dtype=paddle.get_default_dtype(), default_initializer=nn.initializer.Constant(1.0) - ) - - def to( - self, - device: Optional[str] = None, - dtype: Optional[paddle.dtype] = None, - ): - if dtype is not None: - self.mean = self.create_parameter( - self.mean.shape, - dtype=dtype, - default_initializer=paddle.nn.initializer.Assign(self.mean.numpy()), - ) - self.std = self.create_parameter( - self.std.shape, dtype=dtype, default_initializer=paddle.nn.initializer.Assign(self.std.numpy()) - ) - if device is not None: - self.mean._to(device) - self.std._to(device) - - return self - - def scale(self, embeds): - embeds = (embeds - self.mean) * 1.0 / self.std - return embeds - - def unscale(self, embeds): - embeds = (embeds * self.std) + self.mean - return embeds diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/__init__.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/__init__.py deleted file mode 100644 index bd434e6c18f6..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/__init__.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import List, Optional, Union - -import numpy as np -import PIL.Image - -from ...utils import BaseOutput, is_paddle_available, is_paddlenlp_available - - -@dataclass -class SafetyConfig(object): - WEAK = { - "sld_warmup_steps": 15, - "sld_guidance_scale": 20, - "sld_threshold": 0.0, - "sld_momentum_scale": 0.0, - "sld_mom_beta": 0.0, - } - MEDIUM = { - "sld_warmup_steps": 10, - "sld_guidance_scale": 1000, - "sld_threshold": 0.01, - "sld_momentum_scale": 0.3, - "sld_mom_beta": 0.4, - } - STRONG = { - "sld_warmup_steps": 7, - "sld_guidance_scale": 2000, - "sld_threshold": 0.025, - "sld_momentum_scale": 0.5, - "sld_mom_beta": 0.7, - } - MAX = { - "sld_warmup_steps": 0, - "sld_guidance_scale": 5000, - "sld_threshold": 1.0, - "sld_momentum_scale": 0.5, - "sld_mom_beta": 0.7, - } - - -@dataclass -class StableDiffusionSafePipelineOutput(BaseOutput): - """ - Output class for Safe Stable Diffusion pipelines. 
- - Args: - images (`List[PIL.Image.Image]` or `np.ndarray`) - List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, - num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. - nsfw_content_detected (`List[bool]`) - List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, or `None` if safety checking could not be performed. - images (`List[PIL.Image.Image]` or `np.ndarray`) - List of denoised PIL images that were flagged by the safety checker any may contain "not-safe-for-work" - (nsfw) content, or `None` if no safety check was performed or no images were flagged. - applied_safety_concept (`str`) - The safety concept that was applied for safety guidance, or `None` if safety guidance was disabled - """ - - images: Union[List[PIL.Image.Image], np.ndarray] - nsfw_content_detected: Optional[List[bool]] - unsafe_images: Optional[Union[List[PIL.Image.Image], np.ndarray]] - applied_safety_concept: Optional[str] - - -if is_paddle_available() and is_paddlenlp_available(): - from .pipeline_stable_diffusion_safe import StableDiffusionPipelineSafe - from .safety_checker import SafeStableDiffusionSafetyChecker diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py deleted file mode 100644 index 9567778f7b8d..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +++ /dev/null @@ -1,713 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -import warnings -from typing import Callable, List, Optional, Union - -import numpy as np -import paddle -from packaging import version - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer - -from ...configuration_utils import FrozenDict -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, logging, randn_tensor -from ..pipeline_utils import DiffusionPipeline -from . import StableDiffusionSafePipelineOutput -from .safety_checker import SafeStableDiffusionSafetyChecker - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class StableDiffusionPipelineSafe(DiffusionPipeline): - r""" - Pipeline for text-to-image generation using Safe Latent Diffusion. - - The implementation is based on the [`StableDiffusionPipeline`] - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
- - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - - _optional_components = ["safety_checker", "feature_extractor"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: SafeStableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = True, - ): - super().__init__() - safety_concept: Optional[str] = ( - "an image showing hate, harassment, violence, suffering, humiliation, harm, suicide, sexual, nudity," - " bodily fluids, blood, obscene gestures, illegal activity, drug use, theft, vandalism, weapons, child" - " abuse, brutality, cruelty" - ) - - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " - "to update the config accordingly as leaving `steps_offset` might led to incorrect results" - " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," - " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file" - ) - deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." - " `clip_sample` should be set to False in the configuration file. Please make sure to update the" - " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" - " future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" - " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" - ) - deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["clip_sample"] = False - scheduler._internal_dict = FrozenDict(new_config) - - if safety_checker is None and requires_safety_checker: - logger.warning( - f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" - " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" - " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face" - " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" - " it only for use-cases that involve analyzing network behavior or auditing its results. For more" - " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." - ) - - if safety_checker is not None and feature_extractor is None: - raise ValueError( - f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" - " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." - ) - - is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version).base_version - ) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 - if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: - deprecation_message = ( - "The configuration file of the unet has set the default `sample_size` to smaller than" - " 64 which seems highly unlikely .If your checkpoint is a fine-tuned version of any of the" - " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" - " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" - " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" - " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" - " in the config might lead to incorrect results in future versions. 
If you have downloaded this" - " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file" - ) - deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(unet.config) - new_config["sample_size"] = 64 - unet._internal_dict = FrozenDict(new_config) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - self._safety_text_concept = safety_concept - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) - - @property - def safety_concept(self): - r""" - Getter method for the safety concept used with SLD - - Returns: - `str`: The text describing the safety concept - """ - return self._safety_text_concept - - @safety_concept.setter - def safety_concept(self, concept): - r""" - Setter method for the safety concept used with SLD - - Args: - concept (`str`): - The text of the new safety concept - """ - self._safety_text_concept = concept - - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - enable_safety_guidance, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). 
- """ - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = prompt_embeds[0] - - # duplicate text embeddings for each generation per prompt, using mps friendly method - bs_embed, seq_len, _ = prompt_embeds.shape - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - max_length = text_input_ids.shape[-1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # Encode the safety concept text - if enable_safety_guidance: - safety_concept_input = self.tokenizer( - [self._safety_text_concept], - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - safety_embeddings = self.text_encoder(safety_concept_input.input_ids)[0] - - # duplicate safety embeddings for each generation per prompt, using mps friendly method - seq_len = safety_embeddings.shape[1] - safety_embeddings = safety_embeddings.tile([batch_size, num_images_per_prompt, 1]) - safety_embeddings = safety_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance + sld, we need to do three forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing three forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds, safety_embeddings]) - - else: - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - def run_safety_checker(self, image, dtype, enable_safety_guidance): - if self.safety_checker is not None: - images = image.copy() - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) - ) - flagged_images = np.zeros((2, *image.shape[1:])) - if any(has_nsfw_concept): - logger.warning( - "Potential NSFW content was detected in one or more images. A black image will be returned" - " instead." - f"{'You may look at this images in the `unsafe_images` variable of the output at your own discretion.' 
if enable_safety_guidance else 'Try again with a different prompt and/or seed.'}" - ) - for idx, has_nsfw_concept in enumerate(has_nsfw_concept): - if has_nsfw_concept: - flagged_images[idx] = images[idx] - image[idx] = np.zeros(image[idx].shape) # black image - else: - has_nsfw_concept = None - flagged_images = None - return image, has_nsfw_concept, flagged_images - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
- ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - def perform_safety_guidance( - self, - enable_safety_guidance, - safety_momentum, - noise_guidance, - noise_pred_out, - i, - sld_guidance_scale, - sld_warmup_steps, - sld_threshold, - sld_momentum_scale, - sld_mom_beta, - ): - # Perform SLD guidance - if enable_safety_guidance: - if safety_momentum is None: - safety_momentum = paddle.zeros_like(noise_guidance) - noise_pred_text, noise_pred_uncond = noise_pred_out[0], noise_pred_out[1] - noise_pred_safety_concept = noise_pred_out[2] - - # Equation 6 - scale = paddle.clip( - paddle.abs((noise_pred_text - noise_pred_safety_concept)) * sld_guidance_scale, max=1.0 - ) - - # Equation 6 - safety_concept_scale = paddle.where( - (noise_pred_text - noise_pred_safety_concept) >= sld_threshold, paddle.zeros_like(scale), scale - ) - - # Equation 4 - noise_guidance_safety = paddle.multiply( - (noise_pred_safety_concept - noise_pred_uncond), safety_concept_scale - ) - - # Equation 7 - noise_guidance_safety = noise_guidance_safety + sld_momentum_scale * safety_momentum - - # Equation 8 - safety_momentum = sld_mom_beta * safety_momentum + (1 - sld_mom_beta) * noise_guidance_safety - - if i >= sld_warmup_steps: # Warmup - # Equation 3 - noise_guidance = noise_guidance - noise_guidance_safety - return noise_guidance, safety_momentum - - @paddle.no_grad() - def __call__( - self, - prompt: Union[str, List[str]], - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - sld_guidance_scale: Optional[float] = 1000, - sld_warmup_steps: Optional[int] = 10, - sld_threshold: Optional[float] = 0.01, - sld_momentum_scale: Optional[float] = 0.3, - sld_mom_beta: Optional[float] = 0.4, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. 
- height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
- The height in pixels of the generated image.
- width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
- The width in pixels of the generated image.
- num_inference_steps (`int`, *optional*, defaults to 50):
- The number of denoising steps. More denoising steps usually lead to a higher quality image at the
- expense of slower inference.
- guidance_scale (`float`, *optional*, defaults to 7.5):
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
- `guidance_scale` is defined as `w` of equation 2 of [Imagen
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
- 1`. A higher guidance scale encourages generating images that are closely linked to the text `prompt`,
- usually at the expense of lower image quality.
- negative_prompt (`str` or `List[str]`, *optional*):
- The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
- if `guidance_scale` is less than `1`).
- num_images_per_prompt (`int`, *optional*, defaults to 1):
- The number of images to generate per prompt.
- eta (`float`, *optional*, defaults to 0.0):
- Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
- [`schedulers.DDIMScheduler`], will be ignored for others.
- generator (`paddle.Generator`, *optional*):
- One or a list of paddle generator(s) to make generation deterministic.
- latents (`paddle.Tensor`, *optional*):
- Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
- generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
- tensor will be generated by sampling using the supplied random `generator`.
- output_type (`str`, *optional*, defaults to `"pil"`):
- The output format of the generated image. Choose between
- [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
- return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
- plain tuple.
- callback (`Callable`, *optional*):
- A function that will be called every `callback_steps` steps during inference. The function will be
- called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`.
- callback_steps (`int`, *optional*, defaults to 1):
- The frequency at which the `callback` function will be called. If not specified, the callback will be
- called at every step.
- sld_guidance_scale (`float`, *optional*, defaults to 1000):
- Safe latent guidance as defined in [Safe Latent Diffusion](https://arxiv.org/abs/2211.05105).
- `sld_guidance_scale` is defined as sS of Eq. 6. If set to be less than 1, safety guidance will be
- disabled.
- sld_warmup_steps (`int`, *optional*, defaults to 10):
- Number of warmup steps for safety guidance. SLD will only be applied for diffusion steps greater than
- `sld_warmup_steps`. `sld_warmup_steps` is defined as `delta` of [Safe Latent
- Diffusion](https://arxiv.org/abs/2211.05105).
- sld_threshold (`float`, *optional*, defaults to 0.01):
- Threshold that separates the hyperplane between appropriate and inappropriate images. `sld_threshold`
- is defined as `lambda` of Eq. 5 in [Safe Latent Diffusion](https://arxiv.org/abs/2211.05105).
- sld_momentum_scale (`float`, *optional*, defaults to 0.3): - Scale of the SLD momentum to be added to the safety guidance at each diffusion step. If set to 0.0 - momentum will be disabled. Momentum is already built up during warmup, i.e. for diffusion steps smaller - than `sld_warmup_steps`. `sld_momentum_scale` is defined as `sm` of Eq. 7 in [Safe Latent - Diffusion](https://arxiv.org/abs/2211.05105). - sld_mom_beta (`float`, *optional*, defaults to 0.4): - Defines how safety guidance momentum builds up. `sld_mom_beta` indicates how much of the previous - momentum will be kept. Momentum is already built up during warmup, i.e. for diffusion steps smaller - than `sld_warmup_steps`. `sld_mom_beta` is defined as `beta m` of Eq. 8 in [Safe Latent - Diffusion](https://arxiv.org/abs/2211.05105). - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs(prompt, height, width, callback_steps) - - # 2. Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - enable_safety_guidance = sld_guidance_scale > 1.0 and do_classifier_free_guidance - if not enable_safety_guidance: - warnings.warn("Safety checker disabled!") - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt, enable_safety_guidance - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. 
- extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - safety_momentum = None - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = ( - paddle.concat([latents] * (3 if enable_safety_guidance else 2)) - if do_classifier_free_guidance - else latents - ) - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_out = noise_pred.chunk((3 if enable_safety_guidance else 2)) - noise_pred_uncond, noise_pred_text = noise_pred_out[0], noise_pred_out[1] - - # default classifier free guidance - noise_guidance = noise_pred_text - noise_pred_uncond - - # Perform SLD guidance - if enable_safety_guidance: - if safety_momentum is None: - safety_momentum = paddle.zeros_like(noise_guidance) - noise_pred_safety_concept = noise_pred_out[2] - - # Equation 6 - scale = paddle.clip( - paddle.abs((noise_pred_text - noise_pred_safety_concept)) * sld_guidance_scale, max=1.0 - ) - - # Equation 6 - safety_concept_scale = paddle.where( - (noise_pred_text - noise_pred_safety_concept) >= sld_threshold, - paddle.zeros_like(scale), - scale, - ) - - # Equation 4 - noise_guidance_safety = paddle.multiply( - (noise_pred_safety_concept - noise_pred_uncond), safety_concept_scale - ) - - # Equation 7 - noise_guidance_safety = noise_guidance_safety + sld_momentum_scale * safety_momentum - - # Equation 8 - safety_momentum = sld_mom_beta * safety_momentum + (1 - sld_mom_beta) * noise_guidance_safety - - if i >= sld_warmup_steps: # Warmup - # Equation 3 - noise_guidance = noise_guidance - noise_guidance_safety - - noise_pred = noise_pred_uncond + guidance_scale * noise_guidance - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 8. Post-processing - image = self.decode_latents(latents) - - # 9. Run safety checker - image, has_nsfw_concept, flagged_images = self.run_safety_checker( - image, prompt_embeds.dtype, enable_safety_guidance - ) - - # 10. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - if flagged_images is not None: - flagged_images = self.numpy_to_pil(flagged_images) - - if not return_dict: - return ( - image, - has_nsfw_concept, - self._safety_text_concept if enable_safety_guidance else None, - flagged_images, - ) - - return StableDiffusionSafePipelineOutput( - images=image, - nsfw_content_detected=has_nsfw_concept, - applied_safety_concept=self._safety_text_concept if enable_safety_guidance else None, - unsafe_images=flagged_images, - ) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/safety_checker.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/safety_checker.py deleted file mode 100644 index 5a6624e8ab57..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/safety_checker.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. 
All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn.functional as F - -from paddlenlp.transformers import ( - CLIPPretrainedModel, - CLIPVisionConfig, - CLIPVisionModel, -) - -from ...utils import logging - -logger = logging.get_logger(__name__) - - -def cosine_distance(image_embeds, text_embeds): - normalized_image_embeds = F.normalize(image_embeds) - normalized_text_embeds = F.normalize(text_embeds) - return paddle.matmul(normalized_image_embeds, normalized_text_embeds, transpose_y=True) - - -class SafeStableDiffusionSafetyChecker(CLIPPretrainedModel): - config_class = CLIPVisionConfig - - def __init__(self, config: CLIPVisionConfig): - super().__init__(config) - self.clip = CLIPVisionModel(config) - - self.vision_projection = paddle.create_parameter( - (config.hidden_size, config.projection_dim), dtype=paddle.get_default_dtype() - ) - - self.register_buffer("concept_embeds", paddle.ones([17, config.projection_dim])) - self.register_buffer("special_care_embeds", paddle.ones([3, config.projection_dim])) - - self.register_buffer("concept_embeds_weights", paddle.ones([17])) - self.register_buffer("special_care_embeds_weights", paddle.ones([3])) - - @paddle.no_grad() - def forward(self, clip_input, images): - pooled_output = self.clip(clip_input)[1] # pooled_output - image_embeds = paddle.matmul(pooled_output, self.vision_projection) - - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16 - special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds).astype("float32").numpy() - cos_dist = cosine_distance(image_embeds, self.concept_embeds).astype("float32").numpy() - - result = [] - batch_size = image_embeds.shape[0] - for i in range(batch_size): - result_img = {"special_scores": {}, "special_care": [], "concept_scores": {}, "bad_concepts": []} - - # increase this value to create a stronger `nfsw` filter - # at the cost of increasing the possibility of filtering benign images - adjustment = 0.0 - - for concept_idx in range(len(special_cos_dist[0])): - concept_cos = special_cos_dist[i][concept_idx] - concept_threshold = self.special_care_embeds_weights[concept_idx].item() - result_img["special_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3) - if result_img["special_scores"][concept_idx] > 0: - result_img["special_care"].append({concept_idx, result_img["special_scores"][concept_idx]}) - adjustment = 0.01 - - for concept_idx in range(len(cos_dist[0])): - concept_cos = cos_dist[i][concept_idx] - concept_threshold = self.concept_embeds_weights[concept_idx].item() - result_img["concept_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3) - if result_img["concept_scores"][concept_idx] > 0: - result_img["bad_concepts"].append(concept_idx) - - result.append(result_img) - - has_nsfw_concepts = [len(res["bad_concepts"]) > 0 for res in result] - - return images, 
has_nsfw_concepts - - def forward_fastdeploy(self, clip_input: paddle.Tensor, images: paddle.Tensor): - pooled_output = self.clip(clip_input)[1] # pooled_output - image_embeds = paddle.matmul(pooled_output, self.vision_projection) - - special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds) - cos_dist = cosine_distance(image_embeds, self.concept_embeds) - - # increase this value to create a stronger `nsfw` filter - # at the cost of increasing the possibility of filtering benign images - adjustment = 0.0 - - special_scores = special_cos_dist - self.special_care_embeds_weights + adjustment - # special_scores = special_scores.round(decimals=3) - special_care = paddle.any(special_scores > 0, axis=1) - special_adjustment = special_care * 0.01 - special_adjustment = special_adjustment.unsqueeze(1).expand([-1, cos_dist.shape[1]]) - - concept_scores = (cos_dist - self.concept_embeds_weights) + special_adjustment - # concept_scores = concept_scores.round(decimals=3) - has_nsfw_concepts = paddle.any(concept_scores > 0, axis=1) - - return images, has_nsfw_concepts diff --git a/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/__init__.py b/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/__init__.py deleted file mode 100644 index 2527b387eea7..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# flake8: noqa -from .pipeline_stochastic_karras_ve import KarrasVePipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py b/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py deleted file mode 100644 index d06ace369622..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import List, Optional, Tuple, Union - -import paddle - -from ...models import UNet2DModel -from ...schedulers import KarrasVeScheduler -from ...utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput - - -class KarrasVePipeline(DiffusionPipeline): - r""" - Stochastic sampling from Karras et al. [1] tailored to the Variance-Expanding (VE) models [2]. Use Algorithm 2 and - the VE column of Table 1 from [1] for reference. - - [1] Karras, Tero, et al. "Elucidating the Design Space of Diffusion-Based Generative Models." - https://arxiv.org/abs/2206.00364 [2] Song, Yang, et al. "Score-based generative modeling through stochastic - differential equations." https://arxiv.org/abs/2011.13456 - - Parameters: - unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image. - scheduler ([`KarrasVeScheduler`]): - Scheduler for the diffusion process to be used in combination with `unet` to denoise the encoded image. - """ - - # add type hints for linting - unet: UNet2DModel - scheduler: KarrasVeScheduler - - def __init__(self, unet: UNet2DModel, scheduler: KarrasVeScheduler): - super().__init__() - self.register_modules(unet=unet, scheduler=scheduler) - - @paddle.no_grad() - def __call__( - self, - batch_size: int = 1, - num_inference_steps: int = 50, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - **kwargs, - ) -> Union[Tuple, ImagePipelineOutput]: - r""" - Args: - batch_size (`int`, *optional*, defaults to 1): - The number of images to generate. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - - Returns: - [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is - True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. - """ - - img_size = self.unet.config.sample_size - shape = (batch_size, 3, img_size, img_size) - - model = self.unet - - # sample x_0 ~ N(0, sigma_0^2 * I) - sample = randn_tensor(shape, generator=generator) * self.scheduler.init_noise_sigma - - self.scheduler.set_timesteps(num_inference_steps) - - for t in self.progress_bar(self.scheduler.timesteps): - # here sigma_t == t_i from the paper - sigma = self.scheduler.schedule[t] - sigma_prev = self.scheduler.schedule[t - 1] if t > 0 else 0 - - # 1. Select temporarily increased noise level sigma_hat - # 2. Add new noise to move from sample_i to sample_hat - sample_hat, sigma_hat = self.scheduler.add_noise_to_input(sample, sigma, generator=generator) - - # 3. Predict the noise residual given the noise magnitude `sigma_hat` - # The model inputs and output are adjusted by following eq. (213) in [1]. - model_output = (sigma_hat / 2) * model((sample_hat + 1) / 2, sigma_hat / 2).sample - - # 4. Evaluate dx/dt at sigma_hat - # 5. 
Take Euler step from sigma to sigma_prev - step_output = self.scheduler.step(model_output, sigma_hat, sigma_prev, sample_hat) - - if sigma_prev != 0: - # 6. Apply 2nd order correction - # The model inputs and output are adjusted by following eq. (213) in [1]. - model_output = (sigma_prev / 2) * model((step_output.prev_sample + 1) / 2, sigma_prev / 2).sample - step_output = self.scheduler.step_correct( - model_output, - sigma_hat, - sigma_prev, - sample_hat, - step_output.prev_sample, - step_output["derivative"], - ) - sample = step_output.prev_sample - - sample = (sample / 2 + 0.5).clip(0, 1) - image = sample.transpose([0, 2, 3, 1]).numpy() - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,) - - return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/__init__.py b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/__init__.py deleted file mode 100644 index 649c39a7ecda..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/__init__.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import List, Optional, Union - -import numpy as np -import paddle - -from ...utils import ( - BaseOutput, - OptionalDependencyNotAvailable, - is_paddle_available, - is_paddlenlp_available, -) - - -@dataclass -class TextToVideoSDPipelineOutput(BaseOutput): - """ - Output class for text to video pipelines. - - Args: - frames (`List[np.ndarray]` or `paddle.Tensor`) - List of denoised frames (essentially images) as NumPy arrays of shape `(height, width, num_channels)` or as - a `paddle` tensor. NumPy array present the denoised images of the diffusion pipeline. The length of the list - denotes the video length i.e., the number of frames. - """ - - frames: Union[List[np.ndarray], paddle.Tensor] - - -try: - if not (is_paddlenlp_available() and is_paddle_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ...utils.dummy_paddle_and_paddlenlp_objects import * -else: - from .pipeline_text_to_video_synth import TextToVideoSDPipeline - from .pipeline_text_to_video_zero import TextToVideoZeroPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py deleted file mode 100644 index e7c01964a7fd..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ /dev/null @@ -1,481 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import paddle - -from paddlenlp.transformers import CLIPTextModel, CLIPTokenizer - -from ...loaders import TextualInversionLoaderMixin -from ...models import AutoencoderKL, UNet3DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import logging, randn_tensor, replace_example_docstring -from ..pipeline_utils import DiffusionPipeline -from . import TextToVideoSDPipelineOutput - -logger = logging.get_logger(__name__) -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import paddle - >>> from ppdiffusers import TextToVideoSDPipeline - >>> from ppdiffusers.utils import export_to_video - - >>> pipe = TextToVideoSDPipeline.from_pretrained( - ... "damo-vilab/text-to-video-ms-1.7b", - ... ) - - >>> prompt = "Spiderman is surfing" - >>> video_frames = pipe(prompt).frames - >>> video_path = export_to_video(video_frames) - >>> video_path - ``` -""" - - -def tensor2vid(video: paddle.Tensor, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) -> List[np.ndarray]: - # This code is copied from https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78 - # reshape to ncfhw - mean = paddle.to_tensor(mean).reshape((1, -1, 1, 1, 1)) - std = paddle.to_tensor(std).reshape((1, -1, 1, 1, 1)) - video = video.multiply(std) - video = video.add(mean) - - video.clip_(min=0, max=1) - i, c, f, h, w = video.shape - images = video.transpose([2, 3, 0, 4, 1]).reshape((f, h, i * w, c)) - images = images.unbind(axis=0) - images = [(image.cpu().numpy() * 255).astype("uint8") for image in images] - return images - - -class TextToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin): - """ - Pipeline for text-to-video generation. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. Same as Stable Diffusion 2. - tokenizer (`CLIPTokenizer`): - Tokenizer of class CLIPTokenizer. - unet ([`UNet3DConditionModel`]): Conditional U-Net architecture to denoise the encoded video latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
- """ - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet3DConditionModel, - scheduler: KarrasDiffusionSchedulers, - ): - super().__init__() - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - """ - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - """ - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - f"The following part of your input was truncated because CLIP can only handle sequences up to {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - prompt_embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask) - prompt_embeds = prompt_embeds[0] - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape((bs_embed * num_images_per_prompt, seq_len, -1)) - # get unconditional embeddings for classifier free 
guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} != {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`: {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, padding="max_length", max_length=max_length, truncation=True, return_tensors="pd" - ) - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - negative_prompt_embeds = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask) - negative_prompt_embeds = negative_prompt_embeds[0] - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape((batch_size * num_images_per_prompt, seq_len, -1)) - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - return prompt_embeds - - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - batch_size, channels, num_frames, height, width = latents.shape - latents = latents.transpose([0, 2, 1, 3, 4]).reshape((batch_size * num_frames, channels, height, width)) - image = self.vae.decode(latents).sample - video = ( - image[None, :].reshape((batch_size, num_frames, -1) + tuple(image.shape[2:])).transpose([0, 2, 1, 3, 4]) - ) - video = video.cast("float32") - return video - - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - if ( - callback_steps is None - or callback_steps is not None - and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}." - ) - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - f"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds` {negative_prompt_embeds.shape}." - ) - - def prepare_latents( - self, batch_size, num_channels_latents, num_frames, height, width, dtype, generator, latents=None - ): - shape = ( - batch_size, - num_channels_latents, - num_frames, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - latents = latents * self.scheduler.init_noise_sigma - return latents - - @paddle.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) - def __call__( - self, - prompt: Union[str, List[str]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_frames: int = 16, - num_inference_steps: int = 50, - guidance_scale: float = 9.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "np", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ): - """ - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the video generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated video. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated video. - num_frames (`int`, *optional*, defaults to 16): - The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds - amounts to 2 seconds of video. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality videos at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate videos that are closely linked to the text `prompt`, - usually at the expense of lower video quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the video generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of [paddle generator(s)] - to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. Latents should be of shape - `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`paddle.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, *optional*, defaults to `"np"`): - The output format of the generate video. Choose between `paddle.Tensor` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.TextToVideoSDPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/ppdiffusers/models/cross_attention.py). - - - Examples: - - Returns: - [`~pipelines.stable_diffusion.TextToVideoSDPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.TextToVideoSDPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated frames. - """ - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - num_images_per_prompt = 1 - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - num_frames, - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # 7. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - # reshape latents - bsz, channel, frames, width, height = latents.shape - latents = latents.transpose([0, 2, 1, 3, 4]).reshape((bsz * frames, channel, width, height)) - noise_pred = noise_pred.transpose([0, 2, 1, 3, 4]).reshape((bsz * frames, channel, width, height)) - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - latents = latents[None, :].reshape((bsz, frames, channel, width, height)).transpose([0, 2, 1, 3, 4]) - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - video_tensor = self.decode_latents(latents) - if output_type == "pd": - video = video_tensor - else: - video = tensor2vid(video_tensor) - if not return_dict: - return (video,) - return TextToVideoSDPipelineOutput(frames=video) diff --git a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py deleted file mode 100644 index d59929cd0898..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ /dev/null @@ -1,536 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -from dataclasses import dataclass -from typing import Callable, List, Optional, Union - -import numpy as np -import paddle -import paddle.nn.functional as F -import PIL - -from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel -from ppdiffusers.pipelines.stable_diffusion import ( - StableDiffusionPipeline, - StableDiffusionSafetyChecker, -) -from ppdiffusers.schedulers import KarrasDiffusionSchedulers -from ppdiffusers.utils import BaseOutput - - -def rearrange_0(tensor, f): - F, C, H, W = tensor.shape - tensor = paddle.transpose(x=paddle.reshape(x=tensor, shape=(F // f, f, C, H, W)), perm=(0, 2, 1, 3, 4)) - return tensor - - -def rearrange_1(tensor): - B, C, F, H, W = tensor.shape - return paddle.reshape(x=paddle.transpose(x=tensor, perm=(0, 2, 1, 3, 4)), shape=(B * F, C, H, W)) - - -def rearrange_3(tensor, f): - F, D, C = tensor.shape - return paddle.reshape(x=tensor, shape=(F // f, f, D, C)) - - -def rearrange_4(tensor): - B, F, D, C = tensor.shape - return paddle.reshape(x=tensor, shape=(B * F, D, C)) - - -class CrossFrameAttnProcessor: - """ - Cross frame attention processor. For each frame the self-attention is replaced with attention with first frame - - Args: - batch_size: The number that represents actual batch size, other than the frames. - For example, using calling unet with a single prompt and num_images_per_prompt=1, batch_size should be - equal to 2, due to classifier-free guidance. - """ - - def __init__(self, batch_size=2): - self.batch_size = batch_size - - def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None): - batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - query = attn.to_q(hidden_states) - is_cross_attention = encoder_hidden_states is not None - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - # Sparse Attention - if not is_cross_attention: - video_length = key.shape[0] // self.batch_size - first_frame_index = [0] * video_length - - # rearrange keys to have batch and frames in the 1st and 2nd dims respectively - key = rearrange_3(key, video_length) - key = key.index_select(paddle.to_tensor(first_frame_index), 1) - # rearrange values to have batch and frames in the 1st and 2nd dims respectively - value = rearrange_3(value, video_length) - value = value.index_select(paddle.to_tensor(first_frame_index), 1) - - # rearrange back to original shape - key = rearrange_4(key) - value = rearrange_4(value) - query = attn.head_to_batch_dim(query, out_dim=3) - key = attn.head_to_batch_dim(key, out_dim=3) - value = attn.head_to_batch_dim(value, out_dim=3) - attention_probs = attn.get_attention_scores(query, key, attention_mask) - hidden_states = paddle.bmm(x=attention_probs, y=value) - hidden_states = attn.batch_to_head_dim(hidden_states, in_dim=3) - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - return hidden_states - - -@dataclass -class TextToVideoPipelineOutput(BaseOutput): - images: Union[List[PIL.Image.Image], np.ndarray] - nsfw_content_detected: Optional[List[bool]] - - -def coords_grid(batch, ht, wd): - coords = 
paddle.meshgrid(paddle.arange(end=ht), paddle.arange(end=wd)) - coords = paddle.stack(x=coords[::-1], axis=0).astype(dtype="float32") - return coords[None].tile(repeat_times=[batch, 1, 1, 1]) - - -def warp_single_latent(latent, reference_flow): - """ - Warp latent of a single frame with given flow - - Args: - latent: latent code of a single frame - reference_flow: flow which to warp the latent with - - Returns: - warped: warped latent - """ - _, _, H, W = reference_flow.shape - _, _, h, w = latent.shape - if isinstance(latent.dtype, paddle.dtype): - dtype = latent.dtype - elif isinstance(latent.dtype, str) and latent.dtype not in ["cpu", "cuda", "ipu", "xpu"]: - dtype = latent.dtype - elif isinstance(latent.dtype, paddle.Tensor): - dtype = latent.dtype.dtype - else: - dtype = coords_grid(1, H, W).dtype - coords0 = coords_grid(1, H, W).cast(dtype) - coords_t0 = coords0 + reference_flow - coords_t0[:, (0)] /= W - coords_t0[:, (1)] /= H - coords_t0 = coords_t0 * 2.0 - 1.0 - coords_t0 = F.interpolate(x=coords_t0, size=(h, w), mode="bilinear") - coords_t0 = paddle.transpose(x=coords_t0, perm=(0, 2, 3, 1)) - warped = F.grid_sample(x=latent, grid=coords_t0, mode="nearest", padding_mode="reflection") - return warped - - -def create_motion_field(motion_field_strength_x, motion_field_strength_y, frame_ids, dtype): - """ - Create translation motion field - - Args: - motion_field_strength_x: motion strength along x-axis - motion_field_strength_y: motion strength along y-axis - frame_ids: indexes of the frames the latents of which are being processed. - This is needed when we perform chunk-by-chunk inference - dtype: dtype - - Returns: - - """ - seq_length = len(frame_ids) - reference_flow = paddle.zeros(shape=(seq_length, 2, 512, 512), dtype=dtype) - for fr_idx in range(seq_length): - reference_flow[(fr_idx), (0), :, :] = motion_field_strength_x * frame_ids[fr_idx] - reference_flow[(fr_idx), (1), :, :] = motion_field_strength_y * frame_ids[fr_idx] - return reference_flow - - -def create_motion_field_and_warp_latents(motion_field_strength_x, motion_field_strength_y, frame_ids, latents): - """ - Creates translation motion and warps the latents accordingly - - Args: - motion_field_strength_x: motion strength along x-axis - motion_field_strength_y: motion strength along y-axis - frame_ids: indexes of the frames the latents of which are being processed. - This is needed when we perform chunk-by-chunk inference - latents: latent codes of frames - - Returns: - warped_latents: warped latents - """ - motion_field = create_motion_field( - motion_field_strength_x=motion_field_strength_x, - motion_field_strength_y=motion_field_strength_y, - frame_ids=frame_ids, - dtype=latents.dtype, - ) - warped_latents = latents.clone().detach() - for i in range(len(warped_latents)): - warped_latents[i] = warp_single_latent(latents[i][None], motion_field[i][None]) - return warped_latents - - -class TextToVideoZeroPipeline(StableDiffusionPipeline): - """ - Pipeline for zero-shot text-to-video generation using Stable Diffusion. - - This model inherits from [`StableDiffusionPipeline`]. Check the superclass documentation for the generic methods - the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. 
Stable Diffusion uses the text portion of CLIP, specifically - the clip-vit-large-patch14 variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class CLIPTokenizer. - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool = True, - ): - super().__init__( - vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker - ) - self.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2)) - - def forward_loop(self, x_t0, t0, t1, generator): - """ - Perform ddpm forward process from time t0 to t1. This is the same as adding noise with corresponding variance. - - Args: - x_t0: latent code at time t0 - t0: t0 - t1: t1 - generator: paddle.Generator object - - Returns: - x_t1: forward process applied to x_t0 from time t0 to t1. - """ - eps = paddle.randn(shape=x_t0.shape, generator=generator, dtype=x_t0.dtype) - alpha_vec = paddle.prod(x=self.scheduler.alphas[t0:t1]) - x_t1 = paddle.sqrt(x=alpha_vec) * x_t0 + paddle.sqrt(x=1 - alpha_vec) * eps - return x_t1 - - def backward_loop( - self, - latents, - timesteps, - prompt_embeds, - guidance_scale, - callback, - callback_steps, - num_warmup_steps, - extra_step_kwargs, - cross_attention_kwargs=None, - ): - """ - Perform backward process given list of time steps - - Args: - latents: Latents at time timesteps[0]. - timesteps: time steps, along which to perform backward process. - prompt_embeds: Pre-generated text embeddings - guidance_scale: - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - extra_step_kwargs: extra_step_kwargs. - cross_attention_kwargs: cross_attention_kwargs. - num_warmup_steps: number of warmup steps. 
- - Returns: - latents: latents of backward process output at time timesteps[-1] - """ - do_classifier_free_guidance = guidance_scale > 1.0 - num_steps = (len(timesteps) - num_warmup_steps) // self.scheduler.order - with self.progress_bar(total=num_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0: - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - return latents.clone().detach() - - @paddle.no_grad() - def __call__( - self, - prompt: Union[str, List[str]], - video_length: Optional[int] = 8, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_videos_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - motion_field_strength_x: float = 12, - motion_field_strength_y: float = 12, - output_type: Optional[str] = "tensor", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - t0: int = 44, - t1: int = 47, - ): - """ - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - video_length (`int`, *optional*, defaults to 8): The number of generated video frames - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. 
If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - num_videos_per_prompt (`int`, *optional*, defaults to 1): - The number of videos to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) - to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will be generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"numpy"`): - The output format of the generated image. Choose between `"latent"` and `"numpy"`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - motion_field_strength_x (`float`, *optional*, defaults to 12): - Strength of motion in generated video along x-axis. See the [paper](https://arxiv.org/abs/2303.13439), - Sect. 3.3.1. - motion_field_strength_y (`float`, *optional*, defaults to 12): - Strength of motion in generated video along y-axis. See the [paper](https://arxiv.org/abs/2303.13439), - Sect. 3.3.1. - t0 (`int`, *optional*, defaults to 44): - Timestep t0. Should be in the range [0, num_inference_steps - 1]. See the - [paper](https://arxiv.org/abs/2303.13439), Sect. 3.3.1. - t1 (`int`, *optional*, defaults to 47): - Timestep t1. Should be in the range [t0 + 1, num_inference_steps - 1]. See the - [paper](https://arxiv.org/abs/2303.13439), Sect. 3.3.1. - - Returns: - [`~pipelines.text_to_video_synthesis.TextToVideoPipelineOutput`]: - The output contains an ndarray of the generated images when output_type != 'latent', otherwise the latent - codes of the generated images, and a list of `bool`s denoting whether the corresponding generated image - likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. - """ - assert video_length > 0 - frame_ids = list(range(video_length)) - assert num_videos_per_prompt == 1 - if isinstance(prompt, str): - prompt = [prompt] - if isinstance(negative_prompt, str): - negative_prompt = [negative_prompt] - - # Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - # Check inputs.
Raise error if not correct - self.check_inputs(prompt, height, width, callback_steps) - - # Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - # Prepare timesteps - self.scheduler.set_timesteps( - num_inference_steps, - ) - timesteps = self.scheduler.timesteps - - # Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_videos_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # Prepare extra step kwargs. - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - - # Perform the first backward process up to time T_1 - x_1_t1 = self.backward_loop( - timesteps=timesteps[: -t1 - 1], - prompt_embeds=prompt_embeds, - latents=latents, - guidance_scale=guidance_scale, - callback=callback, - callback_steps=callback_steps, - extra_step_kwargs=extra_step_kwargs, - num_warmup_steps=num_warmup_steps, - ) - scheduler_copy = copy.deepcopy(self.scheduler) - - # Perform the second backward process up to time T_0 - x_1_t0 = self.backward_loop( - timesteps=timesteps[-t1 - 1 : -t0 - 1], - prompt_embeds=prompt_embeds, - latents=x_1_t1, - guidance_scale=guidance_scale, - callback=callback, - callback_steps=callback_steps, - extra_step_kwargs=extra_step_kwargs, - num_warmup_steps=0, - ) - - # Propagate first frame latents at time T_0 to remaining frames - x_2k_t0 = x_1_t0.tile(repeat_times=[video_length - 1, 1, 1, 1]) - - # Add motion in latents at time T_0 - x_2k_t0 = create_motion_field_and_warp_latents( - motion_field_strength_x=motion_field_strength_x, - motion_field_strength_y=motion_field_strength_y, - latents=x_2k_t0, - frame_ids=frame_ids[1:], - ) - - # Perform forward process up to time T_1 - x_2k_t1 = self.forward_loop( - x_t0=x_2k_t0, t0=timesteps[-t0 - 1].item(), t1=timesteps[-t1 - 1].item(), generator=generator - ) - - # Perform backward process from time T_1 to 0 - x_1k_t1 = paddle.concat(x=[x_1_t1, x_2k_t1]) - b, l, d = prompt_embeds.shape - prompt_embeds = ( - prompt_embeds[:, (None)].tile(repeat_times=[1, video_length, 1, 1]).reshape([b * video_length, l, d]) - ) - self.scheduler = scheduler_copy - x_1k_0 = self.backward_loop( - timesteps=timesteps[-t1 - 1 :], - prompt_embeds=prompt_embeds, - latents=x_1k_t1, - guidance_scale=guidance_scale, - callback=callback, - callback_steps=callback_steps, - extra_step_kwargs=extra_step_kwargs, - num_warmup_steps=0, - ) - latents = x_1k_0 - paddle.device.cuda.empty_cache() - if output_type == "latent": - image = latents - has_nsfw_concept = None - else: - image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) - if not return_dict: - return image, has_nsfw_concept - return TextToVideoPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/unclip/__init__.py b/ppdiffusers/ppdiffusers/pipelines/unclip/__init__.py deleted file mode 100644 index 
f6aa36a3413d..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/unclip/__init__.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from ...utils import ( - OptionalDependencyNotAvailable, - is_paddle_available, - is_paddlenlp_available, -) - -try: - if not (is_paddlenlp_available() and is_paddle_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ...utils.dummy_paddle_and_paddlenlp_objects import ( - UnCLIPImageVariationPipeline, - UnCLIPPipeline, - ) -else: - from .pipeline_unclip import UnCLIPPipeline - from .pipeline_unclip_image_variation import UnCLIPImageVariationPipeline - from .text_proj import UnCLIPTextProjModel diff --git a/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip.py b/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip.py deleted file mode 100644 index 12f693e26476..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip.py +++ /dev/null @@ -1,505 +0,0 @@ -# Copyright 2022 Kakao Brain and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import List, Optional, Tuple, Union - -import paddle -import paddle.nn.functional as F - -from paddlenlp.transformers import CLIPTextModelWithProjection, CLIPTokenizer -from paddlenlp.transformers.clip.modeling import CLIPTextModelOutput - -from ...models import PriorTransformer, UNet2DConditionModel, UNet2DModel -from ...pipelines import DiffusionPipeline -from ...pipelines.pipeline_utils import ImagePipelineOutput -from ...schedulers import UnCLIPScheduler -from ...utils import logging, randn_tensor -from .text_proj import UnCLIPTextProjModel - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class UnCLIPPipeline(DiffusionPipeline): - """ - Pipeline for text-to-image generation using unCLIP - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - text_encoder ([`CLIPTextModelWithProjection`]): - Frozen text-encoder. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). 
- prior ([`PriorTransformer`]): - The canonincal unCLIP prior to approximate the image embedding from the text embedding. - text_proj ([`UnCLIPTextProjModel`]): - Utility class to prepare and combine the embeddings before they are passed to the decoder. - decoder ([`UNet2DConditionModel`]): - The decoder to invert the image embedding into an image. - super_res_first ([`UNet2DModel`]): - Super resolution unet. Used in all but the last step of the super resolution diffusion process. - super_res_last ([`UNet2DModel`]): - Super resolution unet. Used in the last step of the super resolution diffusion process. - prior_scheduler ([`UnCLIPScheduler`]): - Scheduler used in the prior denoising process. Just a modified DDPMScheduler. - decoder_scheduler ([`UnCLIPScheduler`]): - Scheduler used in the decoder denoising process. Just a modified DDPMScheduler. - super_res_scheduler ([`UnCLIPScheduler`]): - Scheduler used in the super resolution denoising process. Just a modified DDPMScheduler. - - """ - - prior: PriorTransformer - decoder: UNet2DConditionModel - text_proj: UnCLIPTextProjModel - text_encoder: CLIPTextModelWithProjection - tokenizer: CLIPTokenizer - super_res_first: UNet2DModel - super_res_last: UNet2DModel - - prior_scheduler: UnCLIPScheduler - decoder_scheduler: UnCLIPScheduler - super_res_scheduler: UnCLIPScheduler - - def __init__( - self, - prior: PriorTransformer, - decoder: UNet2DConditionModel, - text_encoder: CLIPTextModelWithProjection, - tokenizer: CLIPTokenizer, - text_proj: UnCLIPTextProjModel, - super_res_first: UNet2DModel, - super_res_last: UNet2DModel, - prior_scheduler: UnCLIPScheduler, - decoder_scheduler: UnCLIPScheduler, - super_res_scheduler: UnCLIPScheduler, - ): - super().__init__() - - self.register_modules( - prior=prior, - decoder=decoder, - text_encoder=text_encoder, - tokenizer=tokenizer, - text_proj=text_proj, - super_res_first=super_res_first, - super_res_last=super_res_last, - prior_scheduler=prior_scheduler, - decoder_scheduler=decoder_scheduler, - super_res_scheduler=super_res_scheduler, - ) - - def prepare_latents(self, shape, dtype, generator, latents, scheduler): - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - else: - if latents.shape != list(shape): - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - - latents = latents * scheduler.init_noise_sigma - return latents - - def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None, - text_attention_mask: Optional[paddle.Tensor] = None, - ): - if text_model_output is None: - batch_size = len(prompt) if isinstance(prompt, list) else 1 - # get prompt text embeddings - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_attention_mask=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - text_mask = text_inputs.attention_mask - - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: 
{removed_text}" - ) - text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] - - text_encoder_output = self.text_encoder(text_input_ids) - - prompt_embeds = text_encoder_output.text_embeds - text_encoder_hidden_states = text_encoder_output.last_hidden_state - - else: - batch_size = text_model_output[0].shape[0] - prompt_embeds, text_encoder_hidden_states = text_model_output[0], text_model_output[1] - text_mask = text_attention_mask - - # duplicate text embeddings for each generation per prompt - seq_len = prompt_embeds.shape[1] - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt]) - prompt_embeds = prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len]) - - # duplicate text_encoder_hidden_states for each generation per prompt - seq_len = text_encoder_hidden_states.shape[1] - text_encoder_hidden_states = text_encoder_hidden_states.tile([1, num_images_per_prompt, 1]) - text_encoder_hidden_states = text_encoder_hidden_states.reshape( - [batch_size * num_images_per_prompt, seq_len, -1] - ) - - # duplicate text_mask for each generation per prompt - seq_len = text_mask.shape[1] - text_mask = text_mask.tile([1, num_images_per_prompt]) - text_mask = text_mask.reshape([batch_size * num_images_per_prompt, seq_len]) - - # prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, axis=0) - # text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, axis=0) - # text_mask = text_mask.repeat_interleave(num_images_per_prompt, axis=0) - - if do_classifier_free_guidance: - uncond_tokens = [""] * batch_size - - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=self.tokenizer.model_max_length, - return_attention_mask=True, - truncation=True, - return_tensors="pd", - ) - uncond_text_mask = uncond_input.attention_mask - negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids) - - negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds - uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - - seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len]) - - seq_len = uncond_text_encoder_hidden_states.shape[1] - uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.tile([1, num_images_per_prompt, 1]) - uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.reshape( - [batch_size * num_images_per_prompt, seq_len, -1] - ) - - # duplicate uncond_text_mask for each generation per prompt - seq_len = uncond_text_mask.shape[1] - uncond_text_mask = uncond_text_mask.tile([1, num_images_per_prompt]) - uncond_text_mask = uncond_text_mask.reshape([batch_size * num_images_per_prompt, seq_len]) - # uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, axis=0) - # done duplicates - - # For classifier free guidance, we need to do two forward passes. 
- # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - text_encoder_hidden_states = paddle.concat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) - - text_mask = paddle.concat([uncond_text_mask, text_mask]) - - return prompt_embeds, text_encoder_hidden_states, text_mask - - @paddle.no_grad() - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - prior_num_inference_steps: int = 25, - decoder_num_inference_steps: int = 25, - super_res_num_inference_steps: int = 7, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - prior_latents: Optional[paddle.Tensor] = None, - decoder_latents: Optional[paddle.Tensor] = None, - super_res_latents: Optional[paddle.Tensor] = None, - text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None, - text_attention_mask: Optional[paddle.Tensor] = None, - prior_guidance_scale: float = 4.0, - decoder_guidance_scale: float = 8.0, - output_type: Optional[str] = "pil", - return_dict: bool = True, - ): - """ - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. This can only be left undefined if - `text_model_output` and `text_attention_mask` is passed. - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - prior_num_inference_steps (`int`, *optional*, defaults to 25): - The number of denoising steps for the prior. More denoising steps usually lead to a higher quality - image at the expense of slower inference. - decoder_num_inference_steps (`int`, *optional*, defaults to 25): - The number of denoising steps for the decoder. More denoising steps usually lead to a higher quality - image at the expense of slower inference. - super_res_num_inference_steps (`int`, *optional*, defaults to 7): - The number of denoising steps for super resolution. More denoising steps usually lead to a higher - quality image at the expense of slower inference. - generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - prior_latents (`paddle.Tensor` of shape (batch size, embeddings dimension), *optional*): - Pre-generated noisy latents to be used as inputs for the prior. - decoder_latents (`paddle.Tensor` of shape (batch size, channels, height, width), *optional*): - Pre-generated noisy latents to be used as inputs for the decoder. - super_res_latents (`paddle.Tensor` of shape (batch size, channels, super res height, super res width), *optional*): - Pre-generated noisy latents to be used as inputs for the decoder. - prior_guidance_scale (`float`, *optional*, defaults to 4.0): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - decoder_guidance_scale (`float`, *optional*, defaults to 4.0): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). 
- `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - text_model_output (`CLIPTextModelOutput`, *optional*): - Pre-defined CLIPTextModel outputs that can be derived from the text encoder. Pre-defined text outputs - can be passed for tasks like text embedding interpolations. Make sure to also pass - `text_attention_mask` in this case. `prompt` can the be left to `None`. - text_attention_mask (`paddle.Tensor`, *optional*): - Pre-defined CLIP text attention mask that can be derived from the tokenizer. Pre-defined text attention - masks are necessary when passing `text_model_output`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - """ - if prompt is not None: - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - else: - batch_size = text_model_output[0].shape[0] - - batch_size = batch_size * num_images_per_prompt - - do_classifier_free_guidance = prior_guidance_scale > 1.0 or decoder_guidance_scale > 1.0 - - prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( - prompt, num_images_per_prompt, do_classifier_free_guidance, text_model_output, text_attention_mask - ) - - # prior - - self.prior_scheduler.set_timesteps(prior_num_inference_steps) - prior_timesteps_tensor = self.prior_scheduler.timesteps - - embedding_dim = self.prior.config.embedding_dim - - prior_latents = self.prepare_latents( - (batch_size, embedding_dim), - prompt_embeds.dtype, - generator, - prior_latents, - self.prior_scheduler, - ) - - for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([prior_latents] * 2) if do_classifier_free_guidance else prior_latents - - predicted_image_embedding = self.prior( - latent_model_input, - timestep=t, - proj_embedding=prompt_embeds, - encoder_hidden_states=text_encoder_hidden_states, - attention_mask=text_mask, - ).predicted_image_embedding - - if do_classifier_free_guidance: - predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2) - predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * ( - predicted_image_embedding_text - predicted_image_embedding_uncond - ) - - if i + 1 == prior_timesteps_tensor.shape[0]: - prev_timestep = None - else: - prev_timestep = prior_timesteps_tensor[i + 1] - - prior_latents = self.prior_scheduler.step( - predicted_image_embedding, - timestep=t, - sample=prior_latents, - generator=generator, - prev_timestep=prev_timestep, - ).prev_sample - - prior_latents = self.prior.post_process_latents(prior_latents) - - image_embeddings = prior_latents - - # done prior - - # decoder - text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( - image_embeddings=image_embeddings, - prompt_embeds=prompt_embeds, - 
text_encoder_hidden_states=text_encoder_hidden_states, - do_classifier_free_guidance=do_classifier_free_guidance, - ) - - decoder_text_mask = F.pad( - text_mask.unsqueeze(0), (self.text_proj.clip_extra_context_tokens, 0), value=1, data_format="NCL" - ).squeeze(0) - - self.decoder_scheduler.set_timesteps(decoder_num_inference_steps) - decoder_timesteps_tensor = self.decoder_scheduler.timesteps - - num_channels_latents = self.decoder.config.in_channels - height = self.decoder.config.sample_size - width = self.decoder.config.sample_size - - decoder_latents = self.prepare_latents( - (batch_size, num_channels_latents, height, width), - text_encoder_hidden_states.dtype, - generator, - decoder_latents, - self.decoder_scheduler, - ) - - for i, t in enumerate(self.progress_bar(decoder_timesteps_tensor)): - # expand the latents if we are doing classifier free guidance - latent_model_input = ( - paddle.concat([decoder_latents] * 2) if do_classifier_free_guidance else decoder_latents - ) - - noise_pred = self.decoder( - sample=latent_model_input, - timestep=t, - encoder_hidden_states=text_encoder_hidden_states, - class_labels=additive_clip_time_embeddings, - attention_mask=decoder_text_mask, - ).sample - - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - # paddle.split is not equal torch.split - noise_pred_uncond, _ = noise_pred_uncond.split( - [latent_model_input.shape[1], noise_pred_uncond.shape[1] - latent_model_input.shape[1]], axis=1 - ) - noise_pred_text, predicted_variance = noise_pred_text.split( - [latent_model_input.shape[1], noise_pred_text.shape[1] - latent_model_input.shape[1]], axis=1 - ) - noise_pred = noise_pred_uncond + decoder_guidance_scale * (noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) - - if i + 1 == decoder_timesteps_tensor.shape[0]: - prev_timestep = None - else: - prev_timestep = decoder_timesteps_tensor[i + 1] - - # compute the previous noisy sample x_t -> x_t-1 - decoder_latents = self.decoder_scheduler.step( - noise_pred, t, decoder_latents, prev_timestep=prev_timestep, generator=generator - ).prev_sample - - decoder_latents = decoder_latents.clip(-1, 1) - - image_small = decoder_latents - - # done decoder - - # super res - - self.super_res_scheduler.set_timesteps(super_res_num_inference_steps) - super_res_timesteps_tensor = self.super_res_scheduler.timesteps - - channels = self.super_res_first.config.in_channels // 2 - height = self.super_res_first.config.sample_size - width = self.super_res_first.config.sample_size - - super_res_latents = self.prepare_latents( - (batch_size, channels, height, width), - image_small.dtype, - generator, - super_res_latents, - self.super_res_scheduler, - ) - - interpolate_antialias = {} - if "antialias" in inspect.signature(F.interpolate).parameters: - interpolate_antialias["antialias"] = True - - image_upscaled = F.interpolate( - image_small, size=[height, width], mode="bicubic", align_corners=False, **interpolate_antialias - ) - - for i, t in enumerate(self.progress_bar(super_res_timesteps_tensor)): - # no classifier free guidance - - if i == super_res_timesteps_tensor.shape[0] - 1: - unet = self.super_res_last - else: - unet = self.super_res_first - - latent_model_input = paddle.concat( - [super_res_latents, image_upscaled.cast(super_res_latents.dtype)], axis=1 - ) - - noise_pred = unet( - sample=latent_model_input, - timestep=t, - ).sample - - if i + 1 == super_res_timesteps_tensor.shape[0]: - prev_timestep = None - else: - 
prev_timestep = super_res_timesteps_tensor[i + 1] - - # compute the previous noisy sample x_t -> x_t-1 - super_res_latents = self.super_res_scheduler.step( - noise_pred, t, super_res_latents, prev_timestep=prev_timestep, generator=generator - ).prev_sample - - image = super_res_latents - # done super res - - # post processing - - image = image * 0.5 + 0.5 - image = image.clip(0, 1) - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,) - - return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip_image_variation.py b/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip_image_variation.py deleted file mode 100644 index 717bfd20f66b..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip_image_variation.py +++ /dev/null @@ -1,441 +0,0 @@ -# Copyright 2022 Kakao Brain and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import List, Optional, Union - -import paddle -import paddle.nn.functional as F -import PIL - -from paddlenlp.transformers import ( - CLIPImageProcessor, - CLIPTextModelWithProjection, - CLIPTokenizer, - CLIPVisionModelWithProjection, -) - -from ...models import UNet2DConditionModel, UNet2DModel -from ...pipelines import DiffusionPipeline, ImagePipelineOutput -from ...schedulers import UnCLIPScheduler -from ...utils import logging, randn_tensor -from .text_proj import UnCLIPTextProjModel - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class UnCLIPImageVariationPipeline(DiffusionPipeline): - """ - Pipeline to generate variations from an input image using unCLIP - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - text_encoder ([`CLIPTextModelWithProjection`]): - Frozen text-encoder. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `image_encoder`. - image_encoder ([`CLIPVisionModelWithProjection`]): - Frozen CLIP image-encoder. unCLIP Image Variation uses the vision portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModelWithProjection), - specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - text_proj ([`UnCLIPTextProjModel`]): - Utility class to prepare and combine the embeddings before they are passed to the decoder. - decoder ([`UNet2DConditionModel`]): - The decoder to invert the image embedding into an image. 
- super_res_first ([`UNet2DModel`]): - Super resolution unet. Used in all but the last step of the super resolution diffusion process. - super_res_last ([`UNet2DModel`]): - Super resolution unet. Used in the last step of the super resolution diffusion process. - decoder_scheduler ([`UnCLIPScheduler`]): - Scheduler used in the decoder denoising process. Just a modified DDPMScheduler. - super_res_scheduler ([`UnCLIPScheduler`]): - Scheduler used in the super resolution denoising process. Just a modified DDPMScheduler. - - """ - - decoder: UNet2DConditionModel - text_proj: UnCLIPTextProjModel - text_encoder: CLIPTextModelWithProjection - tokenizer: CLIPTokenizer - feature_extractor: CLIPImageProcessor - image_encoder: CLIPVisionModelWithProjection - super_res_first: UNet2DModel - super_res_last: UNet2DModel - - decoder_scheduler: UnCLIPScheduler - super_res_scheduler: UnCLIPScheduler - - def __init__( - self, - decoder: UNet2DConditionModel, - text_encoder: CLIPTextModelWithProjection, - tokenizer: CLIPTokenizer, - text_proj: UnCLIPTextProjModel, - feature_extractor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection, - super_res_first: UNet2DModel, - super_res_last: UNet2DModel, - decoder_scheduler: UnCLIPScheduler, - super_res_scheduler: UnCLIPScheduler, - ): - super().__init__() - - self.register_modules( - decoder=decoder, - text_encoder=text_encoder, - tokenizer=tokenizer, - text_proj=text_proj, - feature_extractor=feature_extractor, - image_encoder=image_encoder, - super_res_first=super_res_first, - super_res_last=super_res_last, - decoder_scheduler=decoder_scheduler, - super_res_scheduler=super_res_scheduler, - ) - - # Copied from ppdiffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents - def prepare_latents(self, shape, dtype, generator, latents, scheduler): - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - else: - if latents.shape != list(shape): - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - - latents = latents * scheduler.init_noise_sigma - return latents - - def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance): - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - # get prompt text embeddings - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - return_attention_mask=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - text_mask = text_inputs.attention_mask - text_encoder_output = self.text_encoder(text_input_ids) - - prompt_embeds = text_encoder_output.text_embeds - text_encoder_hidden_states = text_encoder_output.last_hidden_state - - # duplicate text embeddings for each generation per prompt - seq_len = prompt_embeds.shape[1] - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt]) - prompt_embeds = prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len]) - - # duplicate text_encoder_hidden_states for each generation per prompt - seq_len = text_encoder_hidden_states.shape[1] - text_encoder_hidden_states = text_encoder_hidden_states.tile([1, num_images_per_prompt, 1]) - text_encoder_hidden_states = text_encoder_hidden_states.reshape( - [batch_size * num_images_per_prompt, seq_len, -1] - ) - - # duplicate text_mask for each generation per prompt - seq_len = text_mask.shape[1] - text_mask = text_mask.tile([1, num_images_per_prompt]) - text_mask = text_mask.reshape([batch_size * num_images_per_prompt, seq_len]) - - # 
prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, axis=0) - # text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, axis=0) - # text_mask = text_mask.repeat_interleave(num_images_per_prompt, axis=0) - - if do_classifier_free_guidance: - uncond_tokens = [""] * batch_size - - max_length = text_input_ids.shape[-1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - return_attention_mask=True, - truncation=True, - return_tensors="pd", - ) - uncond_text_mask = uncond_input.attention_mask - negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids) - - negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds - uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - - seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len]) - - seq_len = uncond_text_encoder_hidden_states.shape[1] - uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.tile([1, num_images_per_prompt, 1]) - uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.reshape( - [batch_size * num_images_per_prompt, seq_len, -1] - ) - - # duplicate uncond_text_mask for each generation per prompt - seq_len = uncond_text_mask.shape[1] - uncond_text_mask = uncond_text_mask.tile([1, num_images_per_prompt]) - uncond_text_mask = uncond_text_mask.reshape([batch_size * num_images_per_prompt, seq_len]) - # uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, axis=0) - # done duplicates - - # For classifier free guidance, we need to do two forward passes. 
- # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - text_encoder_hidden_states = paddle.concat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) - - text_mask = paddle.concat([uncond_text_mask, text_mask]) - - return prompt_embeds, text_encoder_hidden_states, text_mask - - def _encode_image(self, image, num_images_per_prompt, image_embeddings: Optional[paddle.Tensor] = None): - - dtype = self.image_encoder.dtype - - if image_embeddings is None: - if not isinstance(image, paddle.Tensor): - image = self.feature_extractor(images=image, return_tensors="pd").pixel_values - - image = image.cast(dtype) - image_embeddings = self.image_encoder(image).image_embeds - - batch_size, seq_len = image_embeddings.shape - image_embeddings = image_embeddings.tile([1, num_images_per_prompt]) - image_embeddings = image_embeddings.reshape([batch_size * num_images_per_prompt, seq_len]) - # image_embeddings = image_embeddings.repeat_interleave(num_images_per_prompt, axis=0) - - return image_embeddings - - @paddle.no_grad() - def __call__( - self, - image: Optional[Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor]] = None, - num_images_per_prompt: int = 1, - decoder_num_inference_steps: int = 25, - super_res_num_inference_steps: int = 7, - generator: Optional[paddle.Generator] = None, - decoder_latents: Optional[paddle.Tensor] = None, - super_res_latents: Optional[paddle.Tensor] = None, - image_embeddings: Optional[paddle.Tensor] = None, - decoder_guidance_scale: float = 8.0, - output_type: Optional[str] = "pil", - return_dict: bool = True, - ): - """ - Function invoked when calling the pipeline for generation. - - Args: - image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `paddle.Tensor`): - The image or images to guide the image generation. If you provide a tensor, it needs to comply with the - configuration of - [this](https://huggingface.co/fusing/karlo-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json) - `CLIPImageProcessor`. Can be left to `None` only when `image_embeddings` are passed. - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - decoder_num_inference_steps (`int`, *optional*, defaults to 25): - The number of denoising steps for the decoder. More denoising steps usually lead to a higher quality - image at the expense of slower inference. - super_res_num_inference_steps (`int`, *optional*, defaults to 7): - The number of denoising steps for super resolution. More denoising steps usually lead to a higher - quality image at the expense of slower inference. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - decoder_latents (`paddle.Tensor` of shape (batch size, channels, height, width), *optional*): - Pre-generated noisy latents to be used as inputs for the decoder. - super_res_latents (`paddle.Tensor` of shape (batch size, channels, super res height, super res width), *optional*): - Pre-generated noisy latents to be used as inputs for the decoder. - decoder_guidance_scale (`float`, *optional*, defaults to 4.0): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). 
Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - image_embeddings (`paddle.Tensor`, *optional*): - Pre-defined image embeddings that can be derived from the image encoder. Pre-defined image embeddings - can be passed for tasks like image interpolations. `image` can the be left to `None`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - """ - if image is not None: - if isinstance(image, PIL.Image.Image): - batch_size = 1 - elif isinstance(image, list): - batch_size = len(image) - else: - batch_size = image.shape[0] - else: - batch_size = image_embeddings.shape[0] - - prompt = [""] * batch_size - - batch_size = batch_size * num_images_per_prompt - - do_classifier_free_guidance = decoder_guidance_scale > 1.0 - - prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( - prompt, num_images_per_prompt, do_classifier_free_guidance - ) - - image_embeddings = self._encode_image(image, num_images_per_prompt, image_embeddings) - - # decoder - text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( - image_embeddings=image_embeddings, - prompt_embeds=prompt_embeds, - text_encoder_hidden_states=text_encoder_hidden_states, - do_classifier_free_guidance=do_classifier_free_guidance, - ) - - decoder_text_mask = F.pad( - text_mask.unsqueeze(0), (self.text_proj.clip_extra_context_tokens, 0), value=1, data_format="NCL" - ).squeeze(0) - - self.decoder_scheduler.set_timesteps(decoder_num_inference_steps) - decoder_timesteps_tensor = self.decoder_scheduler.timesteps - - num_channels_latents = self.decoder.config.in_channels - height = self.decoder.config.sample_size - width = self.decoder.config.sample_size - - if decoder_latents is None: - decoder_latents = self.prepare_latents( - (batch_size, num_channels_latents, height, width), - text_encoder_hidden_states.dtype, - generator, - decoder_latents, - self.decoder_scheduler, - ) - - for i, t in enumerate(self.progress_bar(decoder_timesteps_tensor)): - # expand the latents if we are doing classifier free guidance - latent_model_input = ( - paddle.concat([decoder_latents] * 2) if do_classifier_free_guidance else decoder_latents - ) - - noise_pred = self.decoder( - sample=latent_model_input, - timestep=t, - encoder_hidden_states=text_encoder_hidden_states, - class_labels=additive_clip_time_embeddings, - attention_mask=decoder_text_mask, - ).sample - - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - # paddle.split is not equal torch.split - noise_pred_uncond, _ = noise_pred_uncond.split( - [latent_model_input.shape[1], noise_pred_uncond.shape[1] - latent_model_input.shape[1]], axis=1 - ) - noise_pred_text, predicted_variance = noise_pred_text.split( - [latent_model_input.shape[1], noise_pred_text.shape[1] - latent_model_input.shape[1]], axis=1 - ) - noise_pred = noise_pred_uncond + decoder_guidance_scale * (noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) - - if i + 1 == decoder_timesteps_tensor.shape[0]: - prev_timestep = None - else: - prev_timestep = 
decoder_timesteps_tensor[i + 1] - - # compute the previous noisy sample x_t -> x_t-1 - decoder_latents = self.decoder_scheduler.step( - noise_pred, t, decoder_latents, prev_timestep=prev_timestep, generator=generator - ).prev_sample - - decoder_latents = decoder_latents.clip(-1, 1) - - image_small = decoder_latents - - # done decoder - - # super res - - self.super_res_scheduler.set_timesteps(super_res_num_inference_steps) - super_res_timesteps_tensor = self.super_res_scheduler.timesteps - - channels = self.super_res_first.config.in_channels // 2 - height = self.super_res_first.config.sample_size - width = self.super_res_first.config.sample_size - - if super_res_latents is None: - super_res_latents = self.prepare_latents( - (batch_size, channels, height, width), - image_small.dtype, - generator, - super_res_latents, - self.super_res_scheduler, - ) - - interpolate_antialias = {} - if "antialias" in inspect.signature(F.interpolate).parameters: - interpolate_antialias["antialias"] = True - - image_upscaled = F.interpolate( - image_small, size=[height, width], mode="bicubic", align_corners=False, **interpolate_antialias - ) - - for i, t in enumerate(self.progress_bar(super_res_timesteps_tensor)): - # no classifier free guidance - - if i == super_res_timesteps_tensor.shape[0] - 1: - unet = self.super_res_last - else: - unet = self.super_res_first - - latent_model_input = paddle.concat( - [super_res_latents, image_upscaled.cast(super_res_latents.dtype)], axis=1 - ) - - noise_pred = unet( - sample=latent_model_input, - timestep=t, - ).sample - - if i + 1 == super_res_timesteps_tensor.shape[0]: - prev_timestep = None - else: - prev_timestep = super_res_timesteps_tensor[i + 1] - - # compute the previous noisy sample x_t -> x_t-1 - super_res_latents = self.super_res_scheduler.step( - noise_pred, t, super_res_latents, prev_timestep=prev_timestep, generator=generator - ).prev_sample - - image = super_res_latents - - # done super res - - # post processing - - image = image * 0.5 + 0.5 - image = image.clip(0, 1) - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,) - - return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/unclip/text_proj.py b/ppdiffusers/ppdiffusers/pipelines/unclip/text_proj.py deleted file mode 100644 index 772055f5afe2..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/unclip/text_proj.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright 2022 Kakao Brain and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -from paddle import nn - -from ...configuration_utils import ConfigMixin, register_to_config -from ...models import ModelMixin - - -class UnCLIPTextProjModel(ModelMixin, ConfigMixin): - """ - Utility class for CLIP embeddings. Used to combine the image and text embeddings into a format usable by the - decoder. 
- - For more details, see the original paper: https://arxiv.org/abs/2204.06125 section 2.1 - """ - - @register_to_config - def __init__( - self, - *, - clip_extra_context_tokens: int = 4, - clip_embeddings_dim: int = 768, - time_embed_dim: int, - cross_attention_dim, - ): - super().__init__() - - self.learned_classifier_free_guidance_embeddings = self.create_parameter( - (clip_embeddings_dim,), dtype=paddle.get_default_dtype(), default_initializer=nn.initializer.Constant(0.0) - ) - - # parameters for additional clip time embeddings - self.embedding_proj = nn.Linear(clip_embeddings_dim, time_embed_dim) - self.clip_image_embeddings_project_to_time_embeddings = nn.Linear(clip_embeddings_dim, time_embed_dim) - - # parameters for encoder hidden states - self.clip_extra_context_tokens = clip_extra_context_tokens - self.clip_extra_context_tokens_proj = nn.Linear( - clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim - ) - self.encoder_hidden_states_proj = nn.Linear(clip_embeddings_dim, cross_attention_dim) - self.text_encoder_hidden_states_norm = nn.LayerNorm(cross_attention_dim) - - def forward(self, *, image_embeddings, prompt_embeds, text_encoder_hidden_states, do_classifier_free_guidance): - - image_embeddings = image_embeddings.cast(self.dtype) - - if do_classifier_free_guidance: - # Add the classifier free guidance embeddings to the image embeddings - image_embeddings_batch_size = image_embeddings.shape[0] - classifier_free_guidance_embeddings = self.learned_classifier_free_guidance_embeddings.unsqueeze(0) - classifier_free_guidance_embeddings = classifier_free_guidance_embeddings.expand( - [image_embeddings_batch_size, -1] - ) - image_embeddings = paddle.concat([classifier_free_guidance_embeddings, image_embeddings], axis=0) - - # The image embeddings batch size and the text embeddings batch size are equal - assert image_embeddings.shape[0] == prompt_embeds.shape[0] - - batch_size = prompt_embeds.shape[0] - - # "Specifically, we modify the architecture described in Nichol et al. (2021) by projecting and - # adding CLIP embeddings to the existing timestep embedding, ... - time_projected_prompt_embeds = self.embedding_proj(prompt_embeds) - time_projected_image_embeddings = self.clip_image_embeddings_project_to_time_embeddings(image_embeddings) - additive_clip_time_embeddings = time_projected_image_embeddings + time_projected_prompt_embeds - - # ... and by projecting CLIP embeddings into four - # extra tokens of context that are concatenated to the sequence of outputs from the GLIDE text encoder" - clip_extra_context_tokens = self.clip_extra_context_tokens_proj(image_embeddings) - clip_extra_context_tokens = clip_extra_context_tokens.reshape([batch_size, -1, self.clip_extra_context_tokens]) - clip_extra_context_tokens = clip_extra_context_tokens.transpose([0, 2, 1]) - - text_encoder_hidden_states = self.encoder_hidden_states_proj(text_encoder_hidden_states) - text_encoder_hidden_states = self.text_encoder_hidden_states_norm(text_encoder_hidden_states) - text_encoder_hidden_states = paddle.concat([clip_extra_context_tokens, text_encoder_hidden_states], axis=1) - - return text_encoder_hidden_states, additive_clip_time_embeddings diff --git a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/__init__.py b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/__init__.py deleted file mode 100644 index d0e447e0ef36..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/__init__.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import List, Optional, Union - -import numpy as np -import PIL - -from ...utils import ( - BaseOutput, - OptionalDependencyNotAvailable, - is_einops_available, - is_paddle_available, - is_paddlenlp_available, -) - - -@dataclass -class ImageTextPipelineOutput(BaseOutput): - """ - Output class for UniDiffuser pipelines. - Args: - images (`List[PIL.Image.Image]` or `np.ndarray`) - List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, - num_channels)`. PIL images or numpy array represent the denoised images of the diffusion pipeline. - texts (`List[str]` or `str`) - List of generated texts. - """ - - images: Union[List[PIL.Image.Image], np.ndarray] - texts: Union[List[str], str] - - -try: - if not (is_paddlenlp_available() and is_paddle_available() and is_einops_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ...utils.dummy_paddle_and_paddlenlp_and_einops_objects import ( - UniDiffuserPipeline, - ) - from ...utils.dummy_paddle_and_paddlenlp_objects import CaptionDecoder -else: - from .caption_decoder import CaptionDecoder - from .pipeline_unidiffuser import UniDiffuserPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/caption_decoder.py b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/caption_decoder.py deleted file mode 100644 index fd63e3065534..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/caption_decoder.py +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
- -from typing import Optional - -import numpy as np -import paddle -from paddle import nn -from paddle.nn import functional as F - -from paddlenlp.transformers import GPTConfig, GPTLMHeadModel - -from ...configuration_utils import ConfigMixin, register_to_config -from ...models.modeling_utils import ModelMixin - - -class CaptionDecoder(ModelMixin, ConfigMixin): - @register_to_config - def __init__( - self, - prefix_length: int = 77, - hidden_dim: int = 64, - vocab_size: int = 50258, - hidden_size: int = 768, - num_hidden_layers: int = 12, - intermediate_size: int = 3072, - hidden_act: int = "gelu", - hidden_dropout_prob: int = 0.1, - attention_probs_dropout_prob: int = 0.1, - max_position_embeddings: int = 1024, - initializer_range: int = 0.02, - eos_token_id: int = 50257, - ): - super(CaptionDecoder, self).__init__() - self.prefix_length = prefix_length - config = GPTConfig( - vocab_size=vocab_size, - hidden_size=hidden_size, - num_hidden_layers=num_hidden_layers, - intermediate_size=intermediate_size, - hidden_act=hidden_act, - hidden_dropout_prob=hidden_dropout_prob, - attention_probs_dropout_prob=attention_probs_dropout_prob, - max_position_embeddings=max_position_embeddings, - initializer_range=initializer_range, - eos_token_id=eos_token_id, - ) - self.gpt = GPTLMHeadModel(config) - - self.hidden_dim = hidden_dim - self.encode_prefix = nn.Linear(hidden_size, hidden_dim) if hidden_dim is not None else nn.Identity() - self.decode_prefix = nn.Linear(hidden_dim, hidden_size) if hidden_dim is not None else nn.Identity() - - def get_dummy_token(self, batch_size: int) -> paddle.Tensor: - return paddle.zeros([batch_size, self.prefix_length], dtype=paddle.int64) - - def forward( - self, - tokens: paddle.Tensor, - prefix: paddle.Tensor, - attention_mask: Optional[paddle.Tensor] = None, - labels: Optional[paddle.Tensor] = None, - ): - embedding_text = self.gpt.gpt.embeddings.word_embeddings(tokens) - hidden = self.encode_prefix(prefix) - prefix = self.decode_prefix(hidden) - embedding_cat = paddle.concat((prefix, embedding_text), axis=1) - - if labels is not None: - dummy_token = self.get_dummy_token(tokens.shape[0]) - labels = paddle.concat((dummy_token, tokens), axis=1) - out = self.gpt(inputs_embeds=embedding_cat, labels=labels, attention_mask=attention_mask) - - if self.hidden_dim: - return out, hidden - else: - return out - - @paddle.no_grad() - def generate_captions(self, tokenizer, features, use_beam_search=True): - # TODO junnyu, support float16 - features = features.cast(self.dtype) - # the low dimension representation of clip feature - features = paddle.split(features, 1, axis=0) - generated_captions = [] - for feature in features: - feature = self.decode_prefix(feature) # back to the clip feature - if use_beam_search: - generated_captions.append(self.generate_beam(tokenizer=tokenizer, embedding=feature)[0]) - else: - generated_captions.append(self.generate2(tokenizer=tokenizer, embedding=feature)) - return generated_captions - - @paddle.no_grad() - def generate_beam( - self, - tokenizer, - prompt=None, - embedding=None, - beam_size: int = 5, - entry_length: int = 67, # maximum number of words - temperature: float = 1.0, - ): - stop_token_index = self.gpt.config.eos_token_id - tokens = None - scores = None - seq_lengths = paddle.ones([beam_size]) - is_stopped = paddle.zeros([beam_size], dtype=paddle.bool) - - if embedding is not None: - generated = embedding - else: - if tokens is None: - tokens = paddle.to_tensor(tokenizer.encode(prompt)["input_ids"]) - tokens = tokens.unsqueeze(0) - 
generated = self.gpt.get_input_embeddings()(tokens) - - for i in range(entry_length): - logits = self.gpt(inputs_embeds=generated) - logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0) - logits = F.softmax(logits, axis=-1).log() - if scores is None: - scores, next_tokens = logits.topk(beam_size, -1) - generated = generated.expand([beam_size, *generated.shape[1:]]) - next_tokens, scores = next_tokens.transpose([1, 0]), scores.squeeze(0) - if tokens is None: - tokens = next_tokens - else: - tokens = tokens.expand([beam_size, *tokens.shape[1:]]) - tokens = paddle.concat((tokens, next_tokens), axis=1) - else: - logits[is_stopped] = -float(np.inf) - logits[is_stopped, 0] = 0 - scores_sum = scores[:, None] + logits - seq_lengths[~is_stopped] += 1 - scores_sum_average = scores_sum / seq_lengths[:, None] - scores_sum_average, next_tokens = scores_sum_average.reshape([-1]).topk(beam_size, -1) - next_tokens_source = next_tokens // scores_sum.shape[1] - seq_lengths = seq_lengths[next_tokens_source] - next_tokens = next_tokens % scores_sum.shape[1] - next_tokens = next_tokens.unsqueeze(1) - tokens = tokens[next_tokens_source] - tokens = paddle.concat((tokens, next_tokens), axis=1) - generated = generated[next_tokens_source] - scores = scores_sum_average * seq_lengths - is_stopped = paddle.cast(is_stopped, "int32") # TODO: nf - is_stopped = is_stopped[next_tokens_source] - is_stopped = paddle.cast(is_stopped, "bool") - - next_token_embed = self.gpt.get_input_embeddings()(next_tokens.squeeze()).reshape( - [generated.shape[0], 1, -1] - ) - generated = paddle.concat((generated, next_token_embed), axis=1) - is_stopped = paddle.bitwise_or(is_stopped, next_tokens.equal(stop_token_index).squeeze()) - if is_stopped.all(): - break - - scores = scores / seq_lengths - output_list = tokens.cpu().numpy() - output_texts = [ - tokenizer.decode(output[: int(length)], skip_special_tokens=True) - for output, length in zip(output_list, seq_lengths) - ] - order = scores.argsort(descending=True) - output_texts = [output_texts[i] for i in order] - return output_texts - - @paddle.no_grad() - def generate2( - self, - tokenizer, - tokens=None, - prompt=None, - embedding=None, - entry_count: int = 1, - entry_length: int = 67, # maximum number of words - top_p: float = 0.8, - temperature: float = 1.0, - ): - generated_list = [] - stop_token_index = self.gpt.config.eos_token_id - filter_value = -float("Inf") - - for i in range(entry_count): - if embedding is not None: - generated = embedding - else: - if tokens is None: - tokens = paddle.to_tensor(tokenizer.encode(prompt)) - tokens = tokens.unsqueeze(0) - generated = self.gpt.get_input_embeddings()(tokens) - - for entry_idx in range(entry_length): - logits = self.gpt(inputs_embeds=generated) - logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0) - sorted_logits = paddle.sort(logits, descending=True) - sorted_indices = paddle.argsort(logits, descending=True) - cumulative_probs = paddle.cumsum(F.softmax(sorted_logits, axis=-1), axis=-1) - sorted_indices_to_remove = cumulative_probs > top_p - sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() - sorted_indices_to_remove[..., 0] = 0 - - indices_to_remove = sorted_indices[sorted_indices_to_remove] - logits[:, indices_to_remove] = filter_value - next_token = paddle.argmax(logits, -1).unsqueeze(0) - next_token_embed = self.gpt.get_input_embeddings()(next_token) - if tokens is None: - tokens = next_token - else: - tokens = paddle.concat((tokens, next_token), axis=1) - 
generated = paddle.concat((generated, next_token_embed), axis=1) - if stop_token_index == next_token.item(): - break - - output_list = list(tokens.squeeze().cpu().numpy()) - output_text = tokenizer.decode(output_list, skip_special_tokens=True) - generated_list.append(output_text) - - return generated_list[0] diff --git a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/pipeline_unidiffuser.py b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/pipeline_unidiffuser.py deleted file mode 100644 index 8f93f77b889e..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +++ /dev/null @@ -1,826 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Callable, List, Optional, Union - -import einops -import numpy as np -import paddle -import PIL -from PIL import Image - -from paddlenlp.transformers import ( - CLIPImageProcessor, - CLIPTextModel, - CLIPTokenizer, - CLIPVisionModelWithProjection, - GPTTokenizer, -) - -from ...models import AutoencoderKL, UViTModel -from ...pipeline_utils import DiffusionPipeline -from ...schedulers import DPMSolverUniDiffuserScheduler -from ...utils import logging, randn_tensor -from . import ImageTextPipelineOutput -from .caption_decoder import CaptionDecoder - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -def center_crop(width, height, img): - resample = {"box": Image.BOX, "lanczos": Image.LANCZOS}["lanczos"] - crop = np.min(img.shape[:2]) - img = img[ - (img.shape[0] - crop) // 2 : (img.shape[0] + crop) // 2, - (img.shape[1] - crop) // 2 : (img.shape[1] + crop) // 2, - ] # center crop - try: - img = Image.fromarray(img, "RGB") - except: - img = Image.fromarray(img) - img = img.resize((width, height), resample) # resize the center crop from [crop, crop] to [width, height] - return np.array(img).astype(np.uint8) - - -class UniDiffuserPipeline(DiffusionPipeline): - - image_encoder: CLIPVisionModelWithProjection - image_feature_extractor: CLIPImageProcessor - text_encoder: CLIPTextModel - tokenizer: CLIPTokenizer - unet: UViTModel - vae: AutoencoderKL - caption_decoder: CaptionDecoder - caption_tokenizer: GPTTokenizer - scheduler: DPMSolverUniDiffuserScheduler - - def __init__( - self, - image_encoder: CLIPVisionModelWithProjection, - image_feature_extractor: CLIPImageProcessor, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UViTModel, - vae: AutoencoderKL, - caption_decoder: CaptionDecoder, - caption_tokenizer: GPTTokenizer, - scheduler: DPMSolverUniDiffuserScheduler, - ): - super().__init__() - self.register_modules( - image_encoder=image_encoder, - image_feature_extractor=image_feature_extractor, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - vae=vae, - caption_decoder=caption_decoder, - caption_tokenizer=caption_tokenizer, - scheduler=scheduler, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - - self.num_channels_latents = 
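The center_crop helper above takes the largest centered square from the input array and resizes it to the target resolution with LANCZOS resampling. A small PIL/NumPy sketch of the same behaviour on a hypothetical 640x480 RGB array:

import numpy as np
from PIL import Image

def center_crop_resize(img, width, height):
    # Crop the largest centered square, then resize it to (width, height).
    crop = min(img.shape[:2])
    top = (img.shape[0] - crop) // 2
    left = (img.shape[1] - crop) // 2
    square = img[top:top + crop, left:left + crop]
    resized = Image.fromarray(square).resize((width, height), Image.LANCZOS)
    return np.array(resized).astype(np.uint8)

dummy = (np.random.rand(480, 640, 3) * 255).astype(np.uint8)
print(center_crop_resize(dummy, 512, 512).shape)  # (512, 512, 3)
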
vae.latent_channels # 4 - self.image_encoder_clip_img_dim = image_encoder.config.projection_dim # 512 - self.text_encoder_seq_len = tokenizer.model_max_length # 77 - self.text_encoder_text_dim = text_encoder.config.hidden_size // text_encoder.config.num_attention_heads # 64 - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." 
- ) - - def _infer_batch_size(self, mode, image, prompt, prompt_embeds, num_samples): - if mode in ["t2i", "t2i2t"]: - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - elif mode in ["i2t", "i2t2i"]: - if isinstance(image, PIL.Image.Image): - batch_size = 1 - else: - batch_size = image.shape[0] - else: - # For unconditional (and marginal) generation, set as num_samples - batch_size = num_samples - return batch_size - - def _split(self, x, height, width): - r""" - Splits a flattened embedding x of shape (B, C * H * W + clip_img_dim) into two tensors of shape (B, C, H, W) - and (B, 1, clip_img_dim) - """ - latent_height = height // self.vae_scale_factor - latent_width = width // self.vae_scale_factor - img_vae_dim = self.num_channels_latents * latent_height * latent_width - - img_vae, img_clip = x.split([img_vae_dim, self.image_encoder_clip_img_dim], axis=1) - - img_vae = einops.rearrange( - img_vae, "B (C H W) -> B C H W", C=self.num_channels_latents, H=latent_height, W=latent_width - ) - img_clip = einops.rearrange(img_clip, "B (L D) -> B L D", L=1, D=self.image_encoder_clip_img_dim) - return img_vae, img_clip - - def _combine(self, img_vae, img_clip): - r""" - Combines a latent iamge img_vae of shape (B, C, H, W) and a CLIP-embedded image img_clip of shape (B, 1, - clip_img_dim) into a single tensor of shape (B, C * H * W + clip_img_dim). - """ - img_vae = einops.rearrange(img_vae, "B C H W -> B (C H W)") - img_clip = einops.rearrange(img_clip, "B L D -> B (L D)") - return paddle.concat([img_vae, img_clip], axis=-1) - - def _split_joint(self, x, height, width): - r""" - Splits a flattened embedding x of shape (B, C * H * W + clip_img_dim + text_seq_len * text_dim] into (img_vae, - img_clip, text) where img_vae is of shape (B, C, H, W), img_clip is of shape (B, 1, clip_img_dim), and text is - of shape (B, text_seq_len, text_dim). - """ - latent_height = height // self.vae_scale_factor - latent_width = width // self.vae_scale_factor - img_vae_dim = self.num_channels_latents * latent_height * latent_width - text_dim = self.text_encoder_seq_len * self.text_encoder_text_dim - - img_vae, img_clip, text = x.split([img_vae_dim, self.image_encoder_clip_img_dim, text_dim], axis=1) - img_vae = einops.rearrange( - img_vae, "B (C H W) -> B C H W", C=self.num_channels_latents, H=latent_height, W=latent_width - ) - img_clip = einops.rearrange(img_clip, "B (L D) -> B L D", L=1, D=self.image_encoder_clip_img_dim) - text = einops.rearrange(text, "B (L D) -> B L D", L=self.text_encoder_seq_len, D=self.text_encoder_text_dim) - return img_vae, img_clip, text - - def _combine_joint(self, img_vae, img_clip, text): - r""" - Combines a latent image img_vae of shape (B, C, H, W), a CLIP-embedded image img_clip of shape (B, L_img, - clip_img_dim), and a text embedding text of shape (B, L_text, text_dim) into a single embedding x of shape (B, - C * H * W + L_img * clip_img_dim + L_text * text_dim). 
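The _combine/_split helpers above pack the VAE latent (B, C, H, W) and the CLIP image embedding (B, 1, clip_img_dim) into one flat vector per sample, so a single scheduler state can carry both, and later cut that vector back apart. A NumPy sketch with hypothetical sizes (C=4, H=W=64, clip_img_dim=512):

import numpy as np

B, C, H, W, clip_dim = 2, 4, 64, 64, 512

img_vae = np.random.rand(B, C, H, W).astype("float32")
img_clip = np.random.rand(B, 1, clip_dim).astype("float32")

# _combine: flatten both tensors and concatenate along the last axis
x = np.concatenate([img_vae.reshape(B, -1), img_clip.reshape(B, -1)], axis=-1)
print(x.shape)  # (2, 16896) == C*H*W + clip_dim

# _split: cut the flat vector back apart and restore the original shapes
vae_flat, clip_flat = x[:, :C * H * W], x[:, C * H * W:]
assert np.allclose(vae_flat.reshape(B, C, H, W), img_vae)
assert np.allclose(clip_flat.reshape(B, 1, clip_dim), img_clip)
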
- """ - img_vae = einops.rearrange(img_vae, "B C H W -> B (C H W)") - img_clip = einops.rearrange(img_clip, "B L D -> B (L D)") - text = einops.rearrange(text, "B L D -> B (L D)") - return paddle.concat([img_vae, img_clip, text], axis=-1) - - # Modified from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt - def encode_text_latents( - self, - prompt, - num_images_per_prompt, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - ): - if prompt_embeds is None: - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - prompt_embeds = self.text_encoder(text_inputs.input_ids)[0] - - return prompt_embeds - - # Modified from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix.StableDiffusionInstructPix2PixPipeline.prepare_image_latents - def encode_image_vae_latents(self, image, batch_size, num_images_per_prompt, dtype, generator=None): - if not isinstance(image, paddle.Tensor): - raise ValueError(f"`image` has to be of type `paddle.Tensor`, but is {type(image)}") - image = image.cast(dtype) - - batch_size = batch_size * num_images_per_prompt - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - # vae encode - if isinstance(generator, list): - image_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) * self.vae.scaling_factor - for i in range(batch_size) - ] - image_latents = paddle.concat(image_latents, axis=0) - else: - image_latents = self.vae.encode(image).latent_dist.sample(generator) * self.vae.scaling_factor - - if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." - ) - else: - image_latents = paddle.concat([image_latents], axis=0) - - return image_latents - - # Modified from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix.StableDiffusionInstructPix2PixPipeline.prepare_image_latents - def encode_image_clip_latents( - self, - image, - batch_size, - num_images_per_prompt, - dtype, - ): - batch_size = batch_size * num_images_per_prompt - - # clip encode - inputs = self.image_feature_extractor(images=Image.fromarray(image), return_tensors="pd").pixel_values - # TODO junnyu, support float16 we need cast dtype - image_latents = self.image_encoder(inputs.cast(self.image_encoder.dtype)).image_embeds.unsqueeze(1) - - if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." 
- ) - else: - image_latents = paddle.concat([image_latents], axis=0) - - return image_latents - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_image_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_text_latents(self, batch_size, seq_len, hidden_size, dtype, generator, latents=None): - # Prepare text latents for the CLIP embedded prompt. - shape = [batch_size, seq_len, hidden_size] - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_image_vae_latents( - self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None - ): - # Prepare latents for the VAE embedded image. - shape = [batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor] - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_image_clip_latents(self, batch_size, clip_img_dim, dtype, generator, latents=None): - # Prepare latents for the CLIP embedded image. - shape = [batch_size, 1, clip_img_dim] - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
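decode_image_latents above rescales the latents by the VAE scaling factor, decodes them, and maps the decoder output from [-1, 1] to [0, 1] as NHWC float32; numpy_to_pil later turns that into images. A NumPy sketch of the post-decode mapping only, with the decode itself stubbed out by random data:

import numpy as np

def postprocess_decoded(sample):
    # Map the VAE decoder output from [-1, 1] to [0, 1], then to HWC uint8 images.
    image = np.clip(sample / 2 + 0.5, 0.0, 1.0)
    image = np.transpose(image, (0, 2, 3, 1))   # NCHW -> NHWC
    return (image * 255).round().astype("uint8")

decoded = np.random.uniform(-1, 1, size=(1, 3, 512, 512)).astype("float32")
print(postprocess_decoded(decoded).shape)  # (1, 512, 512, 3)
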
- ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - def get_noise_pred( - self, - mode, - latents, - t, - img_vae, - img_clip, - prompt_embeds, - N, - guidance_scale, - height, - width, - data_type=1, - generator=None, - ): - dtype = self.unet.dtype - if mode == "joint": - img_vae_latents, img_clip_latents, text_latents = self._split_joint(latents, height, width) - img_vae_out, img_clip_out, text_out = self.unet( - img=img_vae_latents, - clip_img=img_clip_latents, - text=text_latents, - t_img=t, - t_text=t, - data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type, - ) - x_out = self._combine_joint(img_vae_out, img_clip_out, text_out) - - if guidance_scale == 0.0: - return x_out - - img_vae_T = randn_tensor(img_vae.shape, generator=generator, dtype=dtype) - img_clip_T = randn_tensor(img_clip.shape, generator=generator, dtype=dtype) - _, _, text_out_uncond = self.unet( - img=img_vae_T, - clip_img=img_clip_T, - text=text_latents, - t_img=paddle.ones_like(t) * N, - t_text=t, - data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type, - ) - text_T = randn_tensor(prompt_embeds.shape, generator=generator, dtype=dtype) - img_vae_out_uncond, img_clip_out_uncond, _ = self.unet( - img=img_vae_latents, - clip_img=img_clip_latents, - text=text_T, - t_img=t, - t_text=paddle.ones_like(t) * N, - data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type, - ) - x_out_uncond = self._combine_joint(img_vae_out_uncond, img_clip_out_uncond, text_out_uncond) - - return x_out + guidance_scale * (x_out - x_out_uncond) - - elif mode == "t2i": - img_vae_latents, img_clip_latents = self._split(latents, height, width) - t_text = paddle.zeros([t.shape[0]], dtype=paddle.int32) - img_vae_out, img_clip_out, text_out = self.unet( - img=img_vae_latents, - clip_img=img_clip_latents, - text=prompt_embeds, - t_img=t, - t_text=t_text, - data_type=paddle.zeros_like(t_text, dtype=paddle.int32) + data_type, - ) - img_out = self._combine(img_vae_out, img_clip_out) - - if guidance_scale == 0.0: - return img_out - - text_T = randn_tensor(prompt_embeds.shape, generator=generator, dtype=dtype) - img_vae_out_uncond, img_clip_out_uncond, text_out_uncond = self.unet( - img=img_vae_latents, - clip_img=img_clip_latents, - text=text_T, - t_img=t, - t_text=paddle.ones_like(t) * N, - data_type=paddle.zeros_like(t_text, dtype=paddle.int32) + data_type, - ) - img_out_uncond = self._combine(img_vae_out_uncond, img_clip_out_uncond) - - return img_out + guidance_scale * (img_out - img_out_uncond) - - elif mode == "i2t": - t_img = paddle.zeros([t.shape[0]], dtype=paddle.int32) - img_vae_out, img_clip_out, text_out = self.unet( - img=img_vae, - clip_img=img_clip, - text=latents, - t_img=t_img, - t_text=t, - data_type=paddle.zeros_like(t_img, dtype=paddle.int32) + data_type, - ) - if guidance_scale == 0.0: - return text_out - - img_vae_T = randn_tensor(img_vae.shape, generator=generator, dtype=dtype) - img_clip_T = randn_tensor(img_clip.shape, generator=generator, dtype=dtype) - img_vae_out_uncond, img_clip_out_uncond, text_out_uncond = self.unet( - img=img_vae_T, - clip_img=img_clip_T, - text=latents, - t_img=paddle.ones_like(t) * N, - t_text=t, - data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type, - ) - return text_out + guidance_scale * (text_out - text_out_uncond) - - elif mode == "t": - img_vae_out, img_clip_out, text_out 
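get_noise_pred above applies the same classifier-free-guidance-style extrapolation in every mode: the conditional prediction is pushed away from an unconditional one by guidance_scale, and the method short-circuits when guidance_scale is 0. The update, sketched with toy vectors:

import numpy as np

def guided(pred_cond, pred_uncond, guidance_scale):
    # Extrapolate away from the unconditional prediction, as in the removed get_noise_pred.
    return pred_cond + guidance_scale * (pred_cond - pred_uncond)

cond = np.array([1.0, 2.0])
uncond = np.array([0.5, 1.0])
print(guided(cond, uncond, 7.0))  # [4.5 9. ]
print(guided(cond, uncond, 0.0))  # guidance_scale == 0 leaves the conditional prediction unchanged
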
= self.unet( - img=img_vae, - clip_img=img_clip, - text=latents, - t_img=paddle.ones_like(t) * N, - t_text=t, - data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type, - ) - return text_out - - elif mode == "i": - img_vae_latents, img_clip_latents = self._split(latents, height, width) - t_text = paddle.ones_like(t) * N - img_vae_out, img_clip_out, text_out = self.unet( - img=img_vae_latents, - clip_img=img_clip_latents, - text=prompt_embeds, - t_img=t, - t_text=t_text, - data_type=paddle.zeros_like(t_text, dtype=paddle.int32) + data_type, - ) - img_out = self._combine(img_vae_out, img_clip_out) - return img_out - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def _denoising_sample_fn( - self, - mode, - image_vae_latents, - image_clip_latents, - prompt_embeds, - num_inference_steps, - extra_step_kwargs, - guidance_scale, - height, - width, - callback, - callback_steps, - ): - # Prepare latent variables - if mode == "joint": - latents = self._combine_joint(image_vae_latents, image_clip_latents, prompt_embeds) - elif mode in ["t2i", "i"]: - latents = self._combine(image_vae_latents, image_clip_latents) - elif mode in ["i2t", "t"]: - latents = prompt_embeds - else: - raise ValueError - - # Set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - N = self.scheduler.config.num_train_timesteps - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - noise_pred = self.get_noise_pred( - mode, - latents, - t * N, - image_vae_latents, - image_clip_latents, - prompt_embeds, - N, - guidance_scale, - height, - width, - ) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if mode == "joint": - image_vae_latents, image_clip_latents, text_latents = self._split_joint(latents, height, width) - return image_vae_latents, image_clip_latents, text_latents - elif mode in ["t2i", "i"]: - image_vae_latents, image_clip_latents = self._split(latents, height, width) - return image_vae_latents, image_clip_latents - elif mode in ["i2t", "t"]: - text_latents = latents - return text_latents - - @paddle.no_grad() - def __call__( - self, - mode: str = "t2i", # t2i, i2t, t2i2t, i2t2i, joint, i, t - image: Optional[Union[paddle.Tensor, PIL.Image.Image]] = None, - prompt: 
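prepare_extra_step_kwargs above inspects the scheduler's step signature so that eta and generator are only forwarded to schedulers that actually accept them. A generic sketch of that pattern with two hypothetical step functions:

import inspect

def build_step_kwargs(step_fn, eta=0.0, generator=None):
    # Only forward kwargs that the scheduler's step() actually accepts.
    params = set(inspect.signature(step_fn).parameters)
    kwargs = {}
    if "eta" in params:
        kwargs["eta"] = eta
    if "generator" in params:
        kwargs["generator"] = generator
    return kwargs

def ddim_like_step(model_output, timestep, sample, eta=0.0, generator=None): ...
def dpm_like_step(model_output, timestep, sample): ...

print(build_step_kwargs(ddim_like_step, eta=0.3))  # {'eta': 0.3, 'generator': None}
print(build_step_kwargs(dpm_like_step, eta=0.3))   # {}
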
Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - num_prompts_per_image: Optional[int] = 1, - num_samples: int = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - prompt_latents: Optional[paddle.Tensor] = None, - vae_latents: Optional[paddle.Tensor] = None, - clip_latents: Optional[paddle.Tensor] = None, - prompt_embeds: Optional[paddle.Tensor] = None, - negative_prompt_embeds: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - use_beam_search: Optional[bool] = True, - **kwargs, - ): - # 0. Default height and width to unet - height = height or self.unet.config.img_size * self.vae_scale_factor - width = width or self.unet.config.img_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - if mode in ["i2t", "i2t2i"]: - self.check_inputs([image], height, width, callback_steps) - - if mode in ["t2i", "t2i2t"]: - self.check_inputs([prompt], height, width, callback_steps) - - # 2. Define call parameters - batch_size = self._infer_batch_size(mode, image, prompt, prompt_embeds, num_samples) - - # 3. Encode input prompt if available; otherwise prepare text latents - if mode in ["t2i", "t2i2t"]: - # 3.1. Encode input prompt(text) - assert prompt is not None or prompt_embeds is not None - prompt_embeds = self.encode_text_latents( - prompt, - num_images_per_prompt, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - # Encode contexts to lower text dim, 768 -> 64 - prompt_embeds = self.unet.encode_prefix(prompt_embeds) - else: - # 3.2. Prepare text latents - prompt_embeds = self.prepare_text_latents( - batch_size, - self.text_encoder_seq_len, - self.text_encoder_text_dim, - paddle.float32, # Placeholder, need to determine correct thing to do for dtype - generator, - prompt_latents, - ) - - # 4. Encode input image if available; otherwise prepare image latents - if mode in ["i2t", "i2t2i"]: - assert image is not None and isinstance(image, PIL.Image.Image) - # 4.1. Encode images, if available - image = np.array(image).astype(np.uint8) - image_crop = center_crop(height, width, image) - # Encode image using CLIP - image_clip_latents = self.encode_image_clip_latents( - image_crop, - batch_size, - num_prompts_per_image, # not num_images_per_prompt - prompt_embeds.dtype, - ) - # Encode image using VAE - image_vae = (image_crop / 127.5 - 1.0).astype(np.float32) - image_vae = einops.rearrange(image_vae, "h w c -> 1 c h w") - image_vae_latents = self.encode_image_vae_latents( - paddle.to_tensor(image_vae), - batch_size, - num_prompts_per_image, # not num_images_per_prompt - prompt_embeds.dtype, - generator, - ) - - else: - # 4.2. 
Prepare image latent variables, if necessary - # Prepare image CLIP latents - image_clip_latents = self.prepare_image_clip_latents( - batch_size * num_images_per_prompt, - self.image_encoder_clip_img_dim, - prompt_embeds.dtype, - generator, - clip_latents, - ) - # Prepare image VAE latents - image_vae_latents = self.prepare_image_vae_latents( - batch_size * num_images_per_prompt, - self.num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - vae_latents, - ) - - # 5. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 6. Prepare timesteps and Denoising loop - if mode in ["i", "t", "i2t", "t2i", "joint"]: - outs = self._denoising_sample_fn( - mode, - image_vae_latents, - image_clip_latents, - prompt_embeds, - num_inference_steps, - extra_step_kwargs, - guidance_scale, - height, - width, - callback, - callback_steps, - ) - elif mode in ["i2t2i"]: - # 'i2t2i' should do 'i2t' first - outs = self._denoising_sample_fn( - "i2t", - image_vae_latents, - image_clip_latents, - prompt_embeds, - num_inference_steps, - extra_step_kwargs, - guidance_scale, - height, - width, - callback, - callback_steps, - ) - elif mode in ["t2i2t"]: - # 't2i2t' should do 't2i' first - outs = self._denoising_sample_fn( - "t2i", - image_vae_latents, - image_clip_latents, - prompt_embeds, - num_inference_steps, - extra_step_kwargs, - guidance_scale, - height, - width, - callback, - callback_steps, - ) - else: - raise ValueError - - # 7. Generate image or text and Post-processing - gen_image, gen_text = None, None - if mode == "joint": - image_vae_latents, image_clip_latents, text_latents = outs - gen_image = self.decode_image_latents(image_vae_latents) - gen_text = self.caption_decoder.generate_captions( - self.caption_tokenizer, text_latents, use_beam_search=use_beam_search - ) - - elif mode in ["t2i", "i", "t2i2t"]: - image_vae_latents, image_clip_latents = outs - if mode in ["t2i", "i"]: - gen_image = self.decode_image_latents(image_vae_latents) - else: - # 't2i2t' should do 'i2t' later - prompt_embeds = self.prepare_text_latents( - batch_size, - self.text_encoder_seq_len, - self.text_encoder_text_dim, - paddle.float32, # Placeholder, need to determine correct thing to do for dtype - generator, - prompt_latents, - ) - text_latents = self._denoising_sample_fn( - "i2t", - image_vae_latents, - image_clip_latents, - prompt_embeds, - num_inference_steps, - extra_step_kwargs, - guidance_scale, - height, - width, - callback, - callback_steps, - ) - gen_text = self.caption_decoder.generate_captions( - self.caption_tokenizer, text_latents, use_beam_search=use_beam_search - ) - - elif mode in ["i2t", "t", "i2t2i"]: - text_latents = outs - if mode in ["i2t", "t"]: - gen_text = self.caption_decoder.generate_captions( - self.caption_tokenizer, text_latents, use_beam_search=use_beam_search - ) - else: - # 'i2t2i' should do 't2i' later - # Prepare image CLIP latents - image_clip_latents = self.prepare_image_clip_latents( - batch_size * num_images_per_prompt, - self.image_encoder_clip_img_dim, - prompt_embeds.dtype, - generator, - clip_latents, - ) - # Prepare image VAE latents - image_vae_latents = self.prepare_image_vae_latents( - batch_size * num_images_per_prompt, - self.num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - vae_latents, - ) - image_vae_latents, image_clip_latents = self._denoising_sample_fn( - "t2i", - image_vae_latents, - image_clip_latents, - text_latents, 
- num_inference_steps, - extra_step_kwargs, - guidance_scale, - height, - width, - callback, - callback_steps, - ) - gen_image = self.decode_image_latents(image_vae_latents) - - # 8. Convert gen_image to PIL, gen_text has no else processing - if output_type == "pil" and gen_image is not None: - gen_image = self.numpy_to_pil(gen_image) - - if not return_dict: - return (gen_image, gen_text) - - return ImageTextPipelineOutput(images=gen_image, texts=gen_text) diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/__init__.py deleted file mode 100644 index 309b32b2d112..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/__init__.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# flake8: noqa - -from ...utils import ( - OptionalDependencyNotAvailable, - is_paddle_available, - is_paddlenlp_available, -) - -try: - if not (is_paddlenlp_available() and is_paddle_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ...utils.dummy_paddle_and_paddlenlp_objects import ( - VersatileDiffusionDualGuidedPipeline, - VersatileDiffusionImageVariationPipeline, - VersatileDiffusionPipeline, - VersatileDiffusionTextToImagePipeline, - ) -else: - from .modeling_text_unet import UNetFlatConditionModel - from .pipeline_versatile_diffusion import VersatileDiffusionPipeline - from .pipeline_versatile_diffusion_dual_guided import ( - VersatileDiffusionDualGuidedPipeline, - ) - from .pipeline_versatile_diffusion_image_variation import ( - VersatileDiffusionImageVariationPipeline, - ) - from .pipeline_versatile_diffusion_text_to_image import ( - VersatileDiffusionTextToImagePipeline, - ) diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/modeling_text_unet.py deleted file mode 100644 index 8d1f2e904923..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ /dev/null @@ -1,1699 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
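The removed versatile_diffusion __init__ guards its imports: if paddle or paddlenlp is unavailable it falls back to dummy placeholder objects instead of failing at import time. A stripped-down sketch of that optional-dependency pattern; the require helper and PIPELINE_AVAILABLE flag are illustrative names, not part of ppdiffusers:

import importlib.util

class OptionalDependencyNotAvailable(ImportError):
    pass

def require(*modules):
    # Raise if any of the named packages cannot be found.
    missing = [m for m in modules if importlib.util.find_spec(m) is None]
    if missing:
        raise OptionalDependencyNotAvailable(f"missing optional deps: {missing}")

try:
    require("paddle", "paddlenlp")
except OptionalDependencyNotAvailable:
    PIPELINE_AVAILABLE = False   # fall back to dummy placeholder objects
else:
    PIPELINE_AVAILABLE = True    # safe to import the real pipelines

print(PIPELINE_AVAILABLE)
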
- -from typing import Any, Dict, List, Optional, Tuple, Union - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.distributed.fleet.utils import recompute - -from ...configuration_utils import ConfigMixin, register_to_config -from ...models import ModelMixin -from ...models.attention import Attention -from ...models.attention_processor import ( - AttentionProcessor, - AttnAddedKVProcessor, - AttnProcessor, -) -from ...models.dual_transformer_2d import DualTransformer2DModel -from ...models.embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps -from ...models.transformer_2d import Transformer2DModel -from ...models.unet_2d_condition import UNet2DConditionOutput -from ...utils import NEG_INF, deprecate, logging - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -def get_down_block( - down_block_type, - num_layers, - in_channels, - out_channels, - temb_channels, - add_downsample, - resnet_eps, - resnet_act_fn, - attn_num_head_channels, - resnet_groups=None, - cross_attention_dim=None, - downsample_padding=None, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, - resnet_time_scale_shift="default", - resnet_skip_time_act=False, # HF missing in v0.16.1 - resnet_out_scale_factor=1.0, # HF missing in v0.16.1 - cross_attention_norm=None, # HF missing in v0.16.1 - resnet_pre_temb_non_linearity: bool = False, -): - down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type - if down_block_type == "DownBlockFlat": - return DownBlockFlat( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - add_downsample=add_downsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - downsample_padding=downsample_padding, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif down_block_type == "CrossAttnDownBlockFlat": - if cross_attention_dim is None: - raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockFlat") - return CrossAttnDownBlockFlat( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - add_downsample=add_downsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - downsample_padding=downsample_padding, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, - dual_cross_attention=dual_cross_attention, - use_linear_projection=use_linear_projection, - only_cross_attention=only_cross_attention, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - raise ValueError(f"{down_block_type} is not supported.") - - -def get_up_block( - up_block_type, - num_layers, - in_channels, - out_channels, - prev_output_channel, - temb_channels, - add_upsample, - resnet_eps, - resnet_act_fn, - attn_num_head_channels, - resnet_groups=None, - cross_attention_dim=None, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, - resnet_time_scale_shift="default", - resnet_skip_time_act=False, # HF missing in v0.16.1 - resnet_out_scale_factor=1.0, # HF missing in v0.16.1 - cross_attention_norm=None, # HF missing in v0.16.1 - resnet_pre_temb_non_linearity: bool = False, -): - 
up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type - if up_block_type == "UpBlockFlat": - return UpBlockFlat( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - prev_output_channel=prev_output_channel, - temb_channels=temb_channels, - add_upsample=add_upsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif up_block_type == "CrossAttnUpBlockFlat": - if cross_attention_dim is None: - raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlockFlat") - return CrossAttnUpBlockFlat( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - prev_output_channel=prev_output_channel, - temb_channels=temb_channels, - add_upsample=add_upsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, - dual_cross_attention=dual_cross_attention, - use_linear_projection=use_linear_projection, - only_cross_attention=only_cross_attention, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - raise ValueError(f"{up_block_type} is not supported.") - - -# Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel with UNet2DConditionModel->UNetFlatConditionModel, nn.Conv2d->LinearMultiDim, Block2D->BlockFlat -class UNetFlatConditionModel(ModelMixin, ConfigMixin): - r""" - UNetFlatConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a - timestep and returns sample shaped output. - - This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the models (such as downloading or saving, etc.) - - Parameters: - sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): - Height and width of input/output sample. - in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample. - out_channels (`int`, *optional*, defaults to 4): The number of channels in the output. - center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample. - flip_sin_to_cos (`bool`, *optional*, defaults to `False`): - Whether to flip the sin to cos in the time embedding. - freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding. - down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockFlat", "CrossAttnDownBlockFlat", "CrossAttnDownBlockFlat", "DownBlockFlat")`): - The tuple of downsample blocks to use. - mid_block_type (`str`, *optional*, defaults to `"UNetMidBlockFlatCrossAttn"`): - The mid block type. Choose from `UNetMidBlockFlatCrossAttn` or `UNetMidBlockFlatSimpleCrossAttn`, will skip - the mid block layer if `None`. - up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockFlat", "CrossAttnUpBlockFlat", "CrossAttnUpBlockFlat", "CrossAttnUpBlockFlat",)`): - The tuple of upsample blocks to use. - only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`): - Whether to include self-attention in the basic transformer blocks, see - [`~models.attention.BasicTransformerBlock`]. 
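get_down_block/get_up_block above are string-dispatch factories: they strip an optional "UNetRes" prefix from the configured block type, validate mode-specific requirements such as cross_attention_dim, and raise for unknown names. A toy sketch of the same dispatch shape, with placeholder registry entries rather than real block classes:

def get_block(block_type, **kwargs):
    # Dispatch a config string to a constructor, mirroring get_down_block/get_up_block.
    block_type = block_type[7:] if block_type.startswith("UNetRes") else block_type
    registry = {
        "DownBlockFlat": lambda **kw: ("DownBlockFlat", kw),
        "CrossAttnDownBlockFlat": lambda **kw: ("CrossAttnDownBlockFlat", kw),
    }
    if block_type == "CrossAttnDownBlockFlat" and kwargs.get("cross_attention_dim") is None:
        raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockFlat")
    if block_type not in registry:
        raise ValueError(f"{block_type} is not supported.")
    return registry[block_type](**kwargs)

print(get_block("UNetResDownBlockFlat", num_layers=2)[0])  # the 'UNetRes' prefix is stripped
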
- block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): - The tuple of output channels for each block. - layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block. - downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution. - mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block. - act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. - norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. - If `None`, it will skip the normalization and activation layers in post-processing - norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. - cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): - The dimension of the cross attention features. - encoder_hid_dim (`int`, *optional*, defaults to None): - If given, `encoder_hidden_states` will be projected from this dimension to `cross_attention_dim`. - attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. - resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config - for resnet blocks, see [`~models.resnet.ResnetBlockFlat`]. Choose from `default` or `scale_shift`. - class_embed_type (`str`, *optional*, defaults to None): - The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`, - `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. - num_class_embeds (`int`, *optional*, defaults to None): - Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing - class conditioning with `class_embed_type` equal to `None`. - time_embedding_type (`str`, *optional*, default to `positional`): - The type of position embedding to use for timesteps. Choose from `positional` or `fourier`. - time_embedding_act_fn (`str`, *optional*, default to `None`): - Optional activation function to use on the time embeddings only one time before they as passed to the rest - of the unet. Choose from `silu`, `mish`, `gelu`, and `swish`. - timestep_post_act (`str, *optional*, default to `None`): - The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`. - time_cond_proj_dim (`int`, *optional*, default to `None`): - The dimension of `cond_proj` layer in timestep embedding. - conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. - conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. - projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when - using the "projection" `class_embed_type`. Required when using the "projection" `class_embed_type`. - class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time - embeddings with the class embeddings. - mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`): - Whether to use cross attention with the mid block when using the `UNetMidBlockFlatSimpleCrossAttn`. If - `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is None, the - `only_cross_attention` value will be used as the value for `mid_block_only_cross_attention`. Else, it will - default to `False`. 
- """ - - _supports_gradient_checkpointing = True - - @register_to_config - def __init__( - self, - sample_size: Optional[int] = None, - in_channels: int = 4, - out_channels: int = 4, - center_input_sample: bool = False, - flip_sin_to_cos: bool = True, - freq_shift: int = 0, - down_block_types: Tuple[str] = ( - "CrossAttnDownBlockFlat", - "CrossAttnDownBlockFlat", - "CrossAttnDownBlockFlat", - "DownBlockFlat", - ), - mid_block_type: Optional[str] = "UNetMidBlockFlatCrossAttn", - up_block_types: Tuple[str] = ( - "UpBlockFlat", - "CrossAttnUpBlockFlat", - "CrossAttnUpBlockFlat", - "CrossAttnUpBlockFlat", - ), - only_cross_attention: Union[bool, Tuple[bool]] = False, - block_out_channels: Tuple[int] = (320, 640, 1280, 1280), - layers_per_block: Union[int, Tuple[int]] = 2, - downsample_padding: int = 1, - mid_block_scale_factor: float = 1, - act_fn: str = "silu", - norm_num_groups: Optional[int] = 32, - norm_eps: float = 1e-5, - cross_attention_dim: Union[int, Tuple[int]] = 1280, - encoder_hid_dim: Optional[int] = None, - attention_head_dim: Union[int, Tuple[int]] = 8, - dual_cross_attention: bool = False, - use_linear_projection: bool = False, - class_embed_type: Optional[str] = None, - num_class_embeds: Optional[int] = None, - upcast_attention: bool = False, - resnet_time_scale_shift: str = "default", - resnet_skip_time_act: bool = False, - resnet_out_scale_factor: int = 1.0, - time_embedding_type: str = "positional", # fourier, positional - time_embedding_act_fn: Optional[str] = None, - timestep_post_act: Optional[str] = None, - time_cond_proj_dim: Optional[int] = None, - conv_in_kernel: int = 3, - conv_out_kernel: int = 3, - projection_class_embeddings_input_dim: Optional[int] = None, - class_embeddings_concat: bool = False, - mid_block_only_cross_attention: Optional[bool] = None, - cross_attention_norm: Optional[str] = None, - resnet_pre_temb_non_linearity: Optional[bool] = False, - ): - super().__init__() - - self.sample_size = sample_size - - # Check inputs - if len(down_block_types) != len(up_block_types): - raise ValueError( - "Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`:" - f" {down_block_types}. `up_block_types`: {up_block_types}." - ) - - if len(block_out_channels) != len(down_block_types): - raise ValueError( - "Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`:" - f" {block_out_channels}. `down_block_types`: {down_block_types}." - ) - - if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): - raise ValueError( - "Must provide the same number of `only_cross_attention` as `down_block_types`." - f" `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." - ) - - if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): - raise ValueError( - "Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`:" - f" {attention_head_dim}. `down_block_types`: {down_block_types}." - ) - if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types): - raise ValueError( - "Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`:" - f" {cross_attention_dim}. `down_block_types`: {down_block_types}." 
- ) - - if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types): - raise ValueError( - "Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`:" - f" {layers_per_block}. `down_block_types`: {down_block_types}." - ) - - # input - conv_in_padding = (conv_in_kernel - 1) // 2 - self.conv_in = LinearMultiDim( - in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding - ) - - # time - if time_embedding_type == "fourier": - time_embed_dim = block_out_channels[0] * 2 - if time_embed_dim % 2 != 0: - raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.") - self.time_proj = GaussianFourierProjection( - time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos - ) - timestep_input_dim = time_embed_dim - elif time_embedding_type == "positional": - time_embed_dim = block_out_channels[0] * 4 - - self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) - timestep_input_dim = block_out_channels[0] - else: - raise ValueError( - f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`." - ) - - self.time_embedding = TimestepEmbedding( - timestep_input_dim, - time_embed_dim, - act_fn=act_fn, - post_act_fn=timestep_post_act, - cond_proj_dim=time_cond_proj_dim, - ) - if encoder_hid_dim is not None: - self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) - else: - self.encoder_hid_proj = None - - # class embedding - if class_embed_type is None and num_class_embeds is not None: - self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) - elif class_embed_type == "timestep": - self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) - elif class_embed_type == "identity": - self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) - elif class_embed_type == "projection": - if projection_class_embeddings_input_dim is None: - raise ValueError( - "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set" - ) - # The projection `class_embed_type` is the same as the timestep `class_embed_type` except - # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings - # 2. it projects from an arbitrary input dimension. - # - # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. - # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. - # As a result, `TimestepEmbedding` can be passed arbitrary vectors. 
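With time_embedding_type="positional", the model above first projects timesteps through Timesteps into transformer-style sinusoidal features before the TimestepEmbedding MLP. A NumPy sketch of such an embedding; the exact frequency/shift convention varies slightly between implementations, so treat this as an approximation rather than the Paddle op:

import numpy as np

def sinusoidal_timestep_embedding(timesteps, dim, flip_sin_to_cos=True, freq_shift=0):
    # Classic sinusoidal embedding of diffusion timesteps: half sin features, half cos features.
    half = dim // 2
    freqs = np.exp(-np.log(10000.0) * np.arange(half) / (half - freq_shift))
    angles = np.asarray(timesteps, dtype="float32")[:, None] * freqs[None, :]
    emb = np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)
    if flip_sin_to_cos:
        emb = np.concatenate([emb[:, half:], emb[:, :half]], axis=-1)  # cos first, then sin
    return emb

print(sinusoidal_timestep_embedding([0, 10, 999], 320).shape)  # (3, 320)
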
- self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) - elif class_embed_type == "simple_projection": - if projection_class_embeddings_input_dim is None: - raise ValueError( - "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set" - ) - self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim) - else: - self.class_embedding = None - - if time_embedding_act_fn is None: - self.time_embed_act = None - elif time_embedding_act_fn == "swish": - self.time_embed_act = lambda x: F.silu(x) - elif time_embedding_act_fn == "mish": - self.time_embed_act = nn.Mish() - elif time_embedding_act_fn == "silu": - self.time_embed_act = nn.Silu() - elif time_embedding_act_fn == "gelu": - self.time_embed_act = nn.GELU() - else: - raise ValueError(f"Unsupported activation function: {time_embedding_act_fn}") - - self.down_blocks = nn.LayerList([]) - self.up_blocks = nn.LayerList([]) - - # pre_temb_act_fun opt - self.resnet_pre_temb_non_linearity = resnet_pre_temb_non_linearity - if resnet_pre_temb_non_linearity: - if act_fn == "swish": - self.down_resnet_temb_nonlinearity = lambda x: F.silu(x) - elif act_fn == "mish": - self.down_resnet_temb_nonlinearity = nn.Mish() - elif act_fn == "silu": - self.down_resnet_temb_nonlinearity = nn.Silu() - elif act_fn == "gelu": - self.down_resnet_temb_nonlinearity = nn.GELU() - - if isinstance(only_cross_attention, bool): - if mid_block_only_cross_attention is None: - mid_block_only_cross_attention = only_cross_attention - - only_cross_attention = [only_cross_attention] * len(down_block_types) - - if mid_block_only_cross_attention is None: - mid_block_only_cross_attention = False - - if isinstance(attention_head_dim, int): - attention_head_dim = (attention_head_dim,) * len(down_block_types) - - if isinstance(cross_attention_dim, int): - cross_attention_dim = (cross_attention_dim,) * len(down_block_types) - - if isinstance(layers_per_block, int): - layers_per_block = [layers_per_block] * len(down_block_types) - - if class_embeddings_concat: - # The time embeddings are concatenated with the class embeddings. 
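Several constructor arguments above (only_cross_attention, attention_head_dim, cross_attention_dim, layers_per_block) may be given either as a scalar or as one value per down block, and scalars are broadcast to per-block lists. A tiny helper sketch of that normalization; per_block is a hypothetical name:

def per_block(value, num_blocks):
    # Broadcast a scalar config value to one entry per block; pass sequences through after a length check.
    if isinstance(value, (list, tuple)):
        if len(value) != num_blocks:
            raise ValueError(f"expected {num_blocks} values, got {len(value)}")
        return list(value)
    return [value] * num_blocks

print(per_block(8, 4))                # [8, 8, 8, 8]
print(per_block((5, 10, 20, 20), 4))  # [5, 10, 20, 20]
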
The dimension of the - # time embeddings passed to the down, middle, and up blocks is twice the dimension of the - # regular time embeddings - blocks_time_embed_dim = time_embed_dim * 2 - else: - blocks_time_embed_dim = time_embed_dim - - # down - output_channel = block_out_channels[0] - for i, down_block_type in enumerate(down_block_types): - input_channel = output_channel - output_channel = block_out_channels[i] - is_final_block = i == len(block_out_channels) - 1 - - down_block = get_down_block( - down_block_type, - num_layers=layers_per_block[i], - in_channels=input_channel, - out_channels=output_channel, - temb_channels=blocks_time_embed_dim, - add_downsample=not is_final_block, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - resnet_groups=norm_num_groups, - cross_attention_dim=cross_attention_dim[i], - attn_num_head_channels=attention_head_dim[i], - downsample_padding=downsample_padding, - dual_cross_attention=dual_cross_attention, - use_linear_projection=use_linear_projection, - only_cross_attention=only_cross_attention[i], - upcast_attention=upcast_attention, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_skip_time_act=resnet_skip_time_act, - resnet_out_scale_factor=resnet_out_scale_factor, - cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - self.down_blocks.append(down_block) - - # mid - if mid_block_type == "UNetMidBlockFlatCrossAttn": - self.mid_block = UNetMidBlockFlatCrossAttn( - in_channels=block_out_channels[-1], - temb_channels=blocks_time_embed_dim, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - output_scale_factor=mid_block_scale_factor, - resnet_time_scale_shift=resnet_time_scale_shift, - cross_attention_dim=cross_attention_dim[-1], - attn_num_head_channels=attention_head_dim[-1], - resnet_groups=norm_num_groups, - dual_cross_attention=dual_cross_attention, - use_linear_projection=use_linear_projection, - upcast_attention=upcast_attention, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif mid_block_type == "UNetMidBlockFlatSimpleCrossAttn": - self.mid_block = UNetMidBlockFlatSimpleCrossAttn( - in_channels=block_out_channels[-1], - temb_channels=blocks_time_embed_dim, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - output_scale_factor=mid_block_scale_factor, - cross_attention_dim=cross_attention_dim[-1], - attn_num_head_channels=attention_head_dim[-1], - resnet_groups=norm_num_groups, - resnet_time_scale_shift=resnet_time_scale_shift, - skip_time_act=resnet_skip_time_act, - only_cross_attention=mid_block_only_cross_attention, - cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - elif mid_block_type is None: - self.mid_block = None - else: - raise ValueError(f"unknown mid_block_type : {mid_block_type}") - - # count how many layers upsample the images - self.num_upsamplers = 0 - - # up - reversed_block_out_channels = list(reversed(block_out_channels)) - reversed_attention_head_dim = list(reversed(attention_head_dim)) - reversed_layers_per_block = list(reversed(layers_per_block)) - reversed_cross_attention_dim = list(reversed(cross_attention_dim)) - reversed_only_cross_attention = list(reversed(only_cross_attention)) - - output_channel = reversed_block_out_channels[0] - for i, up_block_type in enumerate(up_block_types): - is_final_block = i == len(block_out_channels) - 1 - - prev_output_channel = output_channel - output_channel = reversed_block_out_channels[i] - input_channel = reversed_block_out_channels[min(i + 1, 
len(block_out_channels) - 1)] - - # add upsample block for all BUT final layer - if not is_final_block: - add_upsample = True - self.num_upsamplers += 1 - else: - add_upsample = False - - up_block = get_up_block( - up_block_type, - num_layers=reversed_layers_per_block[i] + 1, - in_channels=input_channel, - out_channels=output_channel, - prev_output_channel=prev_output_channel, - temb_channels=blocks_time_embed_dim, - add_upsample=add_upsample, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - resnet_groups=norm_num_groups, - cross_attention_dim=reversed_cross_attention_dim[i], - attn_num_head_channels=reversed_attention_head_dim[i], - dual_cross_attention=dual_cross_attention, - use_linear_projection=use_linear_projection, - only_cross_attention=reversed_only_cross_attention[i], - upcast_attention=upcast_attention, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_skip_time_act=resnet_skip_time_act, - resnet_out_scale_factor=resnet_out_scale_factor, - cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - self.up_blocks.append(up_block) - prev_output_channel = output_channel - - # out - if norm_num_groups is not None: - self.conv_norm_out = nn.GroupNorm( - num_channels=block_out_channels[0], num_groups=norm_num_groups, epsilon=norm_eps - ) - self.conv_act = nn.Silu() - else: - self.conv_norm_out = None - self.conv_act = None - - conv_out_padding = (conv_out_kernel - 1) // 2 - self.conv_out = LinearMultiDim( - block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding - ) - - @property - def in_channels(self): - deprecate( - "in_channels", - "1.0.0", - ( - "Accessing `in_channels` directly via unet.in_channels is deprecated. Please use" - " `unet.config.in_channels` instead" - ), - standard_warn=False, - ) - return self.config.in_channels - - @property - def attn_processors(self) -> Dict[str, AttentionProcessor]: - r""" - Returns: - `dict` of attention processors: A dictionary containing all attention processors used in the model with - indexed by its weight name. - """ - # set recursively - processors = {} - - def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]): - if hasattr(module, "set_processor"): - processors[f"{name}.processor"] = module.processor - - for sub_name, child in module.named_children(): - fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) - - return processors - - for name, module in self.named_children(): - fn_recursive_add_processors(name, module, processors) - - return processors - - def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): - r""" - Parameters: - `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): - The instantiated processor class or a dictionary of processor classes that will be set as the processor - of **all** `Attention` layers. - In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.: - - """ - count = len(self.attn_processors.keys()) - - if isinstance(processor, dict) and len(processor) != count: - raise ValueError( - f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" - f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
- ) - - def fn_recursive_attn_processor(name: str, module: nn.Layer, processor): - if hasattr(module, "set_processor"): - if not isinstance(processor, dict): - module.set_processor(processor) - else: - module.set_processor(processor.pop(f"{name}.processor")) - - for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) - - for name, module in self.named_children(): - fn_recursive_attn_processor(name, module, processor) - - def set_default_attn_processor(self): - """ - Disables custom attention processors and sets the default attention implementation. - """ - self.set_attn_processor(AttnProcessor()) - - def set_attention_slice(self, slice_size): - r""" - Enable sliced attention computation. - - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - - Args: - slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is - provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` - must be a multiple of `slice_size`. - """ - sliceable_head_dims = [] - - def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): - if hasattr(module, "set_attention_slice"): - sliceable_head_dims.append(module.sliceable_head_dim) - - for child in module.children(): - fn_recursive_retrieve_sliceable_dims(child) - - # retrieve number of attention layers - for module in self.children(): - fn_recursive_retrieve_sliceable_dims(module) - - num_sliceable_layers = len(sliceable_head_dims) - - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = [dim // 2 for dim in sliceable_head_dims] - elif slice_size == "max": - # make smallest slice possible - slice_size = num_sliceable_layers * [1] - - slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size - - if len(slice_size) != len(sliceable_head_dims): - raise ValueError( - f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" - f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." - ) - - for i in range(len(slice_size)): - size = slice_size[i] - dim = sliceable_head_dims[i] - if size is not None and size > dim: - raise ValueError(f"size {size} has to be smaller or equal to {dim}.") - - # Recursively walk through all the children. 
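
The slice-size resolution implemented just above ("auto" halves each sliceable head dim, "max" forces a single slice per layer, and an int is broadcast to every layer and then validated against the collected head dims) can be exercised in isolation. The sketch below is a standalone paraphrase of that logic for illustration only; the function name `resolve_slice_size` and the example head dims are made up and are not part of ppdiffusers.

```py
# Standalone sketch (illustrative names, not ppdiffusers API) of how
# `set_attention_slice` normalizes the `slice_size` argument before it is
# pushed down recursively. `sliceable_head_dims` stands in for the head dims
# gathered from every sub-module that exposes `set_attention_slice`.
def resolve_slice_size(slice_size, sliceable_head_dims):
    num_layers = len(sliceable_head_dims)
    if slice_size == "auto":
        # halve each head dim: attention runs in two steps per layer
        slice_size = [dim // 2 for dim in sliceable_head_dims]
    elif slice_size == "max":
        # one slice at a time: maximum memory savings
        slice_size = num_layers * [1]
    if not isinstance(slice_size, list):
        # a single int applies to every sliceable layer
        slice_size = num_layers * [slice_size]
    if len(slice_size) != num_layers:
        raise ValueError(f"expected {num_layers} values, got {len(slice_size)}")
    for size, dim in zip(slice_size, sliceable_head_dims):
        if size is not None and size > dim:
            raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
    return slice_size

print(resolve_slice_size("auto", [8, 16, 16]))  # [4, 8, 8]
print(resolve_slice_size("max", [8, 16, 16]))   # [1, 1, 1]
print(resolve_slice_size(4, [8, 16, 16]))       # [4, 4, 4]
```
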
- # Any children which exposes the set_attention_slice method - # gets the message - def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]): - if hasattr(module, "set_attention_slice"): - module.set_attention_slice(slice_size.pop()) - - for child in module.children(): - fn_recursive_set_attention_slice(child, slice_size) - - reversed_slice_size = list(reversed(slice_size)) - for module in self.children(): - fn_recursive_set_attention_slice(module, reversed_slice_size) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (CrossAttnDownBlockFlat, DownBlockFlat, CrossAttnUpBlockFlat, UpBlockFlat)): - module.gradient_checkpointing = value - - def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - encoder_hidden_states: paddle.Tensor, - class_labels: Optional[paddle.Tensor] = None, - timestep_cond: Optional[paddle.Tensor] = None, - attention_mask: Optional[paddle.Tensor] = None, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None, - mid_block_additional_residual: Optional[paddle.Tensor] = None, - return_dict: bool = True, - ) -> Union[UNet2DConditionOutput, Tuple]: - r""" - Args: - sample (`paddle.Tensor`): (batch, channel, height, width) noisy inputs tensor - timestep (`paddle.Tensor` or `float` or `int`): (batch) timesteps - encoder_hidden_states (`paddle.Tensor`): (batch, sequence_length, feature_dim) encoder hidden states - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [ppdiffusers.cross_attention](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/ppdiffusers/ppdiffusers/models/cross_attention.py). - - Returns: - [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: - [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. - """ - sample = sample.cast(self.dtype) - - # By default samples have to be AT least a multiple of the overall upsampling factor. - # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). - # However, the upsampling interpolation output size can be forced to fit any upsampling size - # on the fly if necessary. - default_overall_up_factor = 2**self.num_upsamplers - - # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` - forward_upsample_size = False - upsample_size = None - - if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): - logger.info("Forward upsample size to force interpolation output size.") - forward_upsample_size = True - - # prepare attention_mask - if attention_mask is not None: - attention_mask = (1 - attention_mask.cast(sample.dtype)) * NEG_INF - attention_mask = attention_mask.unsqueeze(1) - - # 0. center input if necessary - if self.config.center_input_sample: - sample = 2 * sample - 1.0 - - # 1. 
time - timesteps = timestep - if not paddle.is_tensor(timesteps): - timesteps = paddle.to_tensor([timesteps], dtype="int64") - elif len(timesteps.shape) == 0: - timesteps = timesteps[None] - - # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps.expand( - [ - sample.shape[0], - ] - ) - t_emb = self.time_proj(timesteps) - - # timesteps does not contain any weights and will always return f32 tensors - # but time_embedding might actually be running in fp16. so we need to cast here. - # there might be better ways to encapsulate this. - t_emb = t_emb.cast(self.dtype) - - emb = self.time_embedding(t_emb, timestep_cond) - - if self.class_embedding is not None: - if class_labels is None: - raise ValueError("class_labels should be provided when num_class_embeds > 0") - - # maybe cast it to float16 - class_labels = class_labels.cast(self.dtype) - - if self.config.class_embed_type == "timestep": - class_labels = self.time_proj(class_labels) - - # maybe cast it to int64 - if isinstance(self.class_embedding, nn.Embedding): - class_labels = class_labels.cast(paddle.int64) - class_emb = self.class_embedding(class_labels).cast(self.dtype) - - if self.config.class_embeddings_concat: - emb = paddle.concat([emb, class_emb], axis=-1) - else: - emb = emb + class_emb - - if self.resnet_pre_temb_non_linearity: - emb = self.down_resnet_temb_nonlinearity(emb) - else: - if self.time_embed_act is not None: - emb = self.time_embed_act(emb) - - if self.encoder_hid_proj is not None: - encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states) - - # 2. pre-process - sample = self.conv_in(sample) - - # 3. down - - is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None - is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None - - down_block_res_samples = (sample,) - for downsample_block in self.down_blocks: - if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: - additional_kwargs = {} - if is_adapter and len(down_block_additional_residuals) > 0: - additional_kwargs["additional_residuals"] = down_block_additional_residuals.pop(0) - - sample, res_samples = downsample_block( - hidden_states=sample, - temb=emb, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, - **additional_kwargs, - ) - else: - sample, res_samples = downsample_block(hidden_states=sample, temb=emb) - - if is_adapter and len(down_block_additional_residuals) > 0: - sample += down_block_additional_residuals.pop(0) - - down_block_res_samples += res_samples - - if is_controlnet: - new_down_block_res_samples = () - - for down_block_res_sample, down_block_additional_residual in zip( - down_block_res_samples, down_block_additional_residuals - ): - down_block_res_sample = down_block_res_sample + down_block_additional_residual - new_down_block_res_samples += (down_block_res_sample,) - down_block_res_samples = new_down_block_res_samples - - # 4. mid - if self.mid_block is not None: - sample = self.mid_block( - sample, - emb, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, - ) - - if is_controlnet: - sample = sample + mid_block_additional_residual - - # 5. 
up - for i, upsample_block in enumerate(self.up_blocks): - is_final_block = i == len(self.up_blocks) - 1 - - res_samples = down_block_res_samples[-len(upsample_block.resnets) :] - down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] - - # if we have not reached the final block and need to forward the - # upsample size, we do it here - if not is_final_block and forward_upsample_size: - upsample_size = down_block_res_samples[-1].shape[2:] - - if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: - sample = upsample_block( - hidden_states=sample, - temb=emb, - res_hidden_states_tuple=res_samples, - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - upsample_size=upsample_size, - attention_mask=attention_mask, - ) - else: - sample = upsample_block( - hidden_states=sample, - temb=emb, - res_hidden_states_tuple=res_samples, - upsample_size=upsample_size, - ) - # 6. post-process - if self.conv_norm_out: - sample = self.conv_norm_out(sample) - sample = self.conv_act(sample) - sample = self.conv_out(sample) - - if not return_dict: - return (sample,) - - return UNet2DConditionOutput(sample=sample) - - -class LinearMultiDim(nn.Linear): - def __init__(self, in_features, out_features=None, second_dim=4, *args, **kwargs): - in_features = [in_features, second_dim, 1] if isinstance(in_features, int) else list(in_features) - if out_features is None: - out_features = in_features - out_features = [out_features, second_dim, 1] if isinstance(out_features, int) else list(out_features) - self.in_features_multidim = in_features - self.out_features_multidim = out_features - self.n_dim = len(self.in_features_multidim) - super().__init__(np.array(in_features).prod(), np.array(out_features).prod()) - self.in_features = self.weight.shape[0] - - def forward(self, input_tensor, *args, **kwargs): - shape = input_tensor.shape - input_tensor = input_tensor.reshape([*shape[0 : -self.n_dim], self.in_features]) - output_tensor = super().forward(input_tensor) - output_tensor = output_tensor.reshape([*shape[0 : -self.n_dim], *self.out_features_multidim]) - return output_tensor - - -class ResnetBlockFlat(nn.Layer): - def __init__( - self, - *, - in_channels, - out_channels=None, - dropout: float = 0.0, - temb_channels: int = 512, - groups: int = 32, - groups_out=None, - pre_norm: bool = True, - eps: float = 1e-6, - time_embedding_norm: str = "default", - use_in_shortcut=None, - second_dim: int = 4, - pre_temb_non_linearity: bool = False, - **kwargs, - ): - super().__init__() - self.pre_temb_non_linearity = pre_temb_non_linearity - self.pre_norm = pre_norm - self.pre_norm = True - - in_channels = [in_channels, second_dim, 1] if isinstance(in_channels, int) else list(in_channels) - self.in_channels_prod = np.array(in_channels).prod() - self.channels_multidim = in_channels - - if out_channels is not None: - out_channels = [out_channels, second_dim, 1] if isinstance(out_channels, int) else list(out_channels) - out_channels_prod = np.array(out_channels).prod() - self.out_channels_multidim = out_channels - else: - out_channels_prod = self.in_channels_prod - self.out_channels_multidim = self.channels_multidim - self.time_embedding_norm = time_embedding_norm - - if groups_out is None: - groups_out = groups - - self.norm1 = nn.GroupNorm(num_groups=groups, num_channels=self.in_channels_prod, epsilon=eps) - self.conv1 = nn.Conv2D(self.in_channels_prod, out_channels_prod, kernel_size=1, padding=0) - - if temb_channels is not None: - 
self.time_emb_proj = nn.Linear(temb_channels, out_channels_prod) - else: - self.time_emb_proj = None - - self.norm2 = nn.GroupNorm(num_groups=groups_out, num_channels=out_channels_prod, epsilon=eps) - self.dropout = nn.Dropout(dropout) - self.conv2 = nn.Conv2D(out_channels_prod, out_channels_prod, kernel_size=1, padding=0) - - self.nonlinearity = nn.Silu() - - self.use_in_shortcut = ( - self.in_channels_prod != out_channels_prod if use_in_shortcut is None else use_in_shortcut - ) - - self.conv_shortcut = None - if self.use_in_shortcut: - self.conv_shortcut = nn.Conv2D( - self.in_channels_prod, out_channels_prod, kernel_size=1, stride=1, padding=0 - ) - self.n_dim = len(self.channels_multidim) - - def forward(self, input_tensor, temb=None): - shape = input_tensor.shape - - input_tensor = input_tensor.reshape([*shape[0 : -self.n_dim], self.in_channels_prod, 1, 1]) - input_tensor = input_tensor.reshape([-1, self.in_channels_prod, 1, 1]) - - hidden_states = input_tensor - - hidden_states = self.norm1(hidden_states) - hidden_states = self.nonlinearity(hidden_states) - hidden_states = self.conv1(hidden_states) - - if temb is not None and self.time_emb_proj is not None: - if not self.pre_temb_non_linearity: - temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None] - else: - temb = self.time_emb_proj(temb)[:, :, None, None] - hidden_states = hidden_states + temb - - hidden_states = self.norm2(hidden_states) - hidden_states = self.nonlinearity(hidden_states) - - hidden_states = self.dropout(hidden_states) - hidden_states = self.conv2(hidden_states) - - if self.conv_shortcut is not None: - input_tensor = self.conv_shortcut(input_tensor) - - output_tensor = input_tensor + hidden_states - - output_tensor = output_tensor.reshape([*shape[0 : -self.n_dim], -1]) - output_tensor = output_tensor.reshape([*shape[0 : -self.n_dim], *self.out_channels_multidim]) - - return output_tensor - - -# Copied from ppdiffusers.models.unet_2d_blocks.DownBlock2D with DownBlock2D->DownBlockFlat, ResnetBlock2D->ResnetBlockFlat, Downsample2D->LinearMultiDim -class DownBlockFlat(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - output_scale_factor: float = 1.0, - add_downsample: bool = True, - downsample_padding: int = 1, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - - for i in range(num_layers): - in_channels = in_channels if i == 0 else out_channels - resnets.append( - ResnetBlockFlat( - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - - self.resnets = nn.LayerList(resnets) - - if add_downsample: - self.downsamplers = nn.LayerList( - [ - LinearMultiDim( - out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" - ) - ] - ) - else: - self.downsamplers = None - - self.gradient_checkpointing = False - - def forward(self, hidden_states, temb=None): - output_states = () - - for resnet in self.resnets: - if self.training and self.gradient_checkpointing and not 
hidden_states.stop_gradient: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) - else: - hidden_states = resnet(hidden_states, temb) - - output_states += (hidden_states,) - - if self.downsamplers is not None: - for downsampler in self.downsamplers: - hidden_states = downsampler(hidden_states) - - output_states += (hidden_states,) - - return hidden_states, output_states - - -# Copied from ppdiffusers.models.unet_2d_blocks.CrossAttnDownBlock2D with CrossAttnDownBlock2D->CrossAttnDownBlockFlat, ResnetBlock2D->ResnetBlockFlat, Downsample2D->LinearMultiDim -class CrossAttnDownBlockFlat(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - attn_num_head_channels: int = 1, - cross_attention_dim: int = 1280, - output_scale_factor: float = 1.0, - downsample_padding: int = 1, - add_downsample: bool = True, - dual_cross_attention: bool = False, - use_linear_projection: bool = False, - only_cross_attention: bool = False, - upcast_attention: bool = False, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - attentions = [] - - self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels - - for i in range(num_layers): - in_channels = in_channels if i == 0 else out_channels - resnets.append( - ResnetBlockFlat( - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - if not dual_cross_attention: - attentions.append( - Transformer2DModel( - attn_num_head_channels, - out_channels // attn_num_head_channels, - in_channels=out_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - use_linear_projection=use_linear_projection, - only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, - ) - ) - else: - attentions.append( - DualTransformer2DModel( - attn_num_head_channels, - out_channels // attn_num_head_channels, - in_channels=out_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - ) - ) - self.attentions = nn.LayerList(attentions) - self.resnets = nn.LayerList(resnets) - - if add_downsample: - self.downsamplers = nn.LayerList( - [ - LinearMultiDim( - out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" - ) - ] - ) - else: - self.downsamplers = None - - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - cross_attention_kwargs=None, - additional_residuals=None, - ): - # TODO(Patrick, William) - attention mask is not used - output_states = () - - for resnet, attn in zip(self.resnets, self.attentions): - if self.training and self.gradient_checkpointing: - - def create_custom_forward(module, return_dict=None): - def custom_forward(*inputs): - if return_dict is not None: - return module(*inputs, 
return_dict=return_dict)[0] - else: - return module(*inputs) - - return custom_forward - - hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) - hidden_states = recompute( - create_custom_forward(attn, return_dict=False), - hidden_states, - encoder_hidden_states, - cross_attention_kwargs, - ) # [0] - else: - hidden_states = resnet(hidden_states, temb) - hidden_states = attn( - hidden_states, - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - output_states += (hidden_states,) - - if additional_residuals is not None: - hidden_states += additional_residuals - - if self.downsamplers is not None: - for downsampler in self.downsamplers: - hidden_states = downsampler(hidden_states) - - output_states += (hidden_states,) - - return hidden_states, output_states - - -# Copied from ppdiffusers.models.unet_2d_blocks.UpBlock2D with UpBlock2D->UpBlockFlat, ResnetBlock2D->ResnetBlockFlat, Upsample2D->LinearMultiDim -class UpBlockFlat(nn.Layer): - def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - output_scale_factor: float = 1.0, - add_upsample: bool = True, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - - for i in range(num_layers): - res_skip_channels = in_channels if (i == num_layers - 1) else out_channels - resnet_in_channels = prev_output_channel if i == 0 else out_channels - - resnets.append( - ResnetBlockFlat( - in_channels=resnet_in_channels + res_skip_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - - self.resnets = nn.LayerList(resnets) - - if add_upsample: - self.upsamplers = nn.LayerList([LinearMultiDim(out_channels, use_conv=True, out_channels=out_channels)]) - else: - self.upsamplers = None - - self.gradient_checkpointing = False - - def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None): - for resnet in self.resnets: - # pop res hidden states - res_hidden_states = res_hidden_states_tuple[-1] - res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) - - if self.training and self.gradient_checkpointing: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) - else: - hidden_states = resnet(hidden_states, temb) - - if self.upsamplers is not None: - for upsampler in self.upsamplers: - hidden_states = upsampler(hidden_states, upsample_size) - - return hidden_states - - -# Copied from ppdiffusers.models.unet_2d_blocks.CrossAttnUpBlock2D with CrossAttnUpBlock2D->CrossAttnUpBlockFlat, ResnetBlock2D->ResnetBlockFlat, Upsample2D->LinearMultiDim -class CrossAttnUpBlockFlat(nn.Layer): - def __init__( - self, - in_channels: int, - out_channels: int, - prev_output_channel: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float 
= 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - attn_num_head_channels: int = 1, - cross_attention_dim: int = 1280, - output_scale_factor: float = 1.0, - add_upsample: bool = True, - dual_cross_attention: bool = False, - use_linear_projection: bool = False, - only_cross_attention: bool = False, - upcast_attention: bool = False, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - resnets = [] - attentions = [] - - self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels - - for i in range(num_layers): - res_skip_channels = in_channels if (i == num_layers - 1) else out_channels - resnet_in_channels = prev_output_channel if i == 0 else out_channels - - resnets.append( - ResnetBlockFlat( - in_channels=resnet_in_channels + res_skip_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - if not dual_cross_attention: - attentions.append( - Transformer2DModel( - attn_num_head_channels, - out_channels // attn_num_head_channels, - in_channels=out_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - use_linear_projection=use_linear_projection, - only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, - ) - ) - else: - attentions.append( - DualTransformer2DModel( - attn_num_head_channels, - out_channels // attn_num_head_channels, - in_channels=out_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - ) - ) - self.attentions = nn.LayerList(attentions) - self.resnets = nn.LayerList(resnets) - - if add_upsample: - self.upsamplers = nn.LayerList([LinearMultiDim(out_channels, use_conv=True, out_channels=out_channels)]) - else: - self.upsamplers = None - - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - res_hidden_states_tuple, - temb=None, - encoder_hidden_states=None, - cross_attention_kwargs=None, - upsample_size=None, - attention_mask=None, - ): - # TODO(Patrick, William) - attention mask is not used - for resnet, attn in zip(self.resnets, self.attentions): - # pop res hidden states - res_hidden_states = res_hidden_states_tuple[-1] - res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) - - if self.training and self.gradient_checkpointing: - - def create_custom_forward(module, return_dict=None): - def custom_forward(*inputs): - if return_dict is not None: - return module(*inputs, return_dict=return_dict)[0] - else: - return module(*inputs) - - return custom_forward - - hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) - hidden_states = recompute( - create_custom_forward(attn, return_dict=False), - hidden_states, - encoder_hidden_states, - cross_attention_kwargs, - ) # [0] - else: - hidden_states = resnet(hidden_states, temb) - hidden_states = attn( - hidden_states, - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - - if self.upsamplers is not None: - for upsampler in self.upsamplers: - hidden_states = upsampler(hidden_states, upsample_size) - - return 
hidden_states - - -# Copied from ppdiffusers.models.unet_2d_blocks.UNetMidBlock2DCrossAttn with UNetMidBlock2DCrossAttn->UNetMidBlockFlatCrossAttn, ResnetBlock2D->ResnetBlockFlat -class UNetMidBlockFlatCrossAttn(nn.Layer): - def __init__( - self, - in_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - attn_num_head_channels: int = 1, - output_scale_factor: float = 1.0, - cross_attention_dim: int = 1280, - dual_cross_attention: bool = False, - use_linear_projection: bool = False, - upcast_attention: bool = False, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - - self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels - resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) - - # there is always at least one resnet - resnets = [ - ResnetBlockFlat( - in_channels=in_channels, - out_channels=in_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ] - attentions = [] - - for _ in range(num_layers): - if not dual_cross_attention: - attentions.append( - Transformer2DModel( - attn_num_head_channels, - in_channels // attn_num_head_channels, - in_channels=in_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - use_linear_projection=use_linear_projection, - upcast_attention=upcast_attention, - ) - ) - else: - attentions.append( - DualTransformer2DModel( - attn_num_head_channels, - in_channels // attn_num_head_channels, - in_channels=in_channels, - num_layers=1, - cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - ) - ) - resnets.append( - ResnetBlockFlat( - in_channels=in_channels, - out_channels=in_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - - self.attentions = nn.LayerList(attentions) - self.resnets = nn.LayerList(resnets) - - def forward( - self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None - ): - hidden_states = self.resnets[0](hidden_states, temb) - for attn, resnet in zip(self.attentions, self.resnets[1:]): - hidden_states = attn( - hidden_states, - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - ).sample - hidden_states = resnet(hidden_states, temb) - - return hidden_states - - -# Copied from ppdiffusers.models.unet_2d_blocks.UNetMidBlock2DSimpleCrossAttn with UNetMidBlock2DSimpleCrossAttn->UNetMidBlockFlatSimpleCrossAttn, ResnetBlock2D->ResnetBlockFlat -class UNetMidBlockFlatSimpleCrossAttn(nn.Layer): - def __init__( - self, - in_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - attn_num_head_channels: int = 1, 
- output_scale_factor: float = 1.0, - cross_attention_dim: int = 1280, - skip_time_act=False, - only_cross_attention=False, - cross_attention_norm=None, - resnet_pre_temb_non_linearity: bool = False, - ): - super().__init__() - - self.has_cross_attention = True - - self.attn_num_head_channels = attn_num_head_channels - resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) - - self.num_heads = in_channels // self.attn_num_head_channels - - # there is always at least one resnet - resnets = [ - ResnetBlockFlat( - in_channels=in_channels, - out_channels=in_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ] - attentions = [] - - for _ in range(num_layers): - # TODO use AttnAddedKVProcessor2_5 - # processor = ( - # AttnAddedKVProcessor2_5() if hasattr(F, "scaled_dot_product_attention_") else AttnAddedKVProcessor() - # ) - processor = AttnAddedKVProcessor() - attentions.append( - Attention( - query_dim=in_channels, - cross_attention_dim=in_channels, - heads=self.num_heads, - dim_head=attn_num_head_channels, - added_kv_proj_dim=cross_attention_dim, - norm_num_groups=resnet_groups, - bias=True, - upcast_softmax=True, - only_cross_attention=only_cross_attention, - cross_attention_norm=cross_attention_norm, - processor=processor, - ) - ) - resnets.append( - ResnetBlockFlat( - in_channels=in_channels, - out_channels=in_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - skip_time_act=skip_time_act, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, - ) - ) - - self.attentions = nn.LayerList(attentions) - self.resnets = nn.LayerList(resnets) - - def forward( - self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None - ): - cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} - hidden_states = self.resnets[0](hidden_states, temb) - for attn, resnet in zip(self.attentions, self.resnets[1:]): - # attn - hidden_states = attn( - hidden_states, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - **cross_attention_kwargs, - ) - - # resnet - hidden_states = resnet(hidden_states, temb) - - return hidden_states diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py deleted file mode 100644 index 449213f25b52..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py +++ /dev/null @@ -1,457 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Callable, List, Optional, Union - -import paddle -import PIL.Image - -from paddlenlp.transformers import ( - CLIPImageProcessor, - CLIPTextModelWithProjection, - CLIPTokenizer, - CLIPVisionModelWithProjection, -) - -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import logging -from ..pipeline_utils import DiffusionPipeline -from .modeling_text_unet import UNetFlatConditionModel -from .pipeline_versatile_diffusion_dual_guided import ( - VersatileDiffusionDualGuidedPipeline, -) -from .pipeline_versatile_diffusion_image_variation import ( - VersatileDiffusionImageVariationPipeline, -) -from .pipeline_versatile_diffusion_text_to_image import ( - VersatileDiffusionTextToImagePipeline, -) - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class VersatileDiffusionPipeline(DiffusionPipeline): - r""" - Pipeline for generation using Versatile Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModelWithProjection`]): - Frozen text-encoder. Versatile Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - image_encoder ([`CLIPVisionModelWithProjection`]): - Frozen vision-encoder. Versatile Diffusion uses the vision portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModelWithProjection), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - image_unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - text_unet ([`UNetFlatConditionModel`]): xxx. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - image_feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
- """ - - tokenizer: CLIPTokenizer - image_feature_extractor: CLIPImageProcessor - text_encoder: CLIPTextModelWithProjection - image_encoder: CLIPVisionModelWithProjection - image_unet: UNet2DConditionModel - text_unet: UNetFlatConditionModel - vae: AutoencoderKL - scheduler: KarrasDiffusionSchedulers - - def __init__( - self, - tokenizer: CLIPTokenizer, - image_feature_extractor: CLIPImageProcessor, - text_encoder: CLIPTextModelWithProjection, - image_encoder: CLIPVisionModelWithProjection, - image_unet: UNet2DConditionModel, - text_unet: UNet2DConditionModel, - vae: AutoencoderKL, - scheduler: KarrasDiffusionSchedulers, - ): - super().__init__() - - self.register_modules( - tokenizer=tokenizer, - image_feature_extractor=image_feature_extractor, - text_encoder=text_encoder, - image_encoder=image_encoder, - image_unet=image_unet, - text_unet=text_unet, - vae=vae, - scheduler=scheduler, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - - @paddle.no_grad() - def image_variation( - self, - image: Union[paddle.Tensor, PIL.Image.Image], - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - image (`PIL.Image.Image`, `List[PIL.Image.Image]` or `paddle.Tensor`): - The image prompt or prompts to guide the image generation. - height (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. 
- latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Examples: - - ```py - >>> from ppdiffusers import VersatileDiffusionPipeline - >>> import paddle - >>> import requests - >>> from io import BytesIO - >>> from PIL import Image - - >>> # let's download an initial image - >>> url = "https://huggingface.co/datasets/diffusers/images/resolve/main/benz.jpg" - - >>> response = requests.get(url) - >>> image = Image.open(BytesIO(response.content)).convert("RGB") - - >>> pipe = VersatileDiffusionPipeline.from_pretrained( - ... "shi-labs/versatile-diffusion", paddle_dtype=paddle.float16 - ... ) - - >>> generator = paddle.Generator().manual_seed(0) - >>> image = pipe.image_variation(image, generator=generator).images[0] - >>> image.save("./car_variation.png") - ``` - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. 
- """ - expected_components = inspect.signature(VersatileDiffusionImageVariationPipeline.__init__).parameters.keys() - components = {name: component for name, component in self.components.items() if name in expected_components} - return VersatileDiffusionImageVariationPipeline(**components)( - image=image, - height=height, - width=width, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - negative_prompt=negative_prompt, - num_images_per_prompt=num_images_per_prompt, - eta=eta, - generator=generator, - latents=latents, - output_type=output_type, - return_dict=return_dict, - callback=callback, - callback_steps=callback_steps, - ) - - @paddle.no_grad() - def text_to_image( - self, - prompt: Union[str, List[str]], - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - height (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. 
Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Examples: - - ```py - >>> from ppdiffusers import VersatileDiffusionPipeline - >>> import paddle - - >>> pipe = VersatileDiffusionPipeline.from_pretrained( - ... "shi-labs/versatile-diffusion", paddle_dtype=paddle.float16 - ... ) - - >>> generator = paddle.Generator().manual_seed(0) - >>> image = pipe.text_to_image("an astronaut riding on a horse on mars", generator=generator).images[0] - >>> image.save("./astronaut.png") - ``` - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - expected_components = inspect.signature(VersatileDiffusionTextToImagePipeline.__init__).parameters.keys() - components = {name: component for name, component in self.components.items() if name in expected_components} - temp_pipeline = VersatileDiffusionTextToImagePipeline(**components) - output = temp_pipeline( - prompt=prompt, - height=height, - width=width, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - negative_prompt=negative_prompt, - num_images_per_prompt=num_images_per_prompt, - eta=eta, - generator=generator, - latents=latents, - output_type=output_type, - return_dict=return_dict, - callback=callback, - callback_steps=callback_steps, - ) - # swap the attention blocks back to the original state - temp_pipeline._swap_unet_attention_blocks() - - return output - - @paddle.no_grad() - def dual_guided( - self, - prompt: Union[PIL.Image.Image, List[PIL.Image.Image]], - image: Union[str, List[str]], - text_to_image_strength: float = 0.5, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - height (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. 
- num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Examples: - - ```py - >>> from ppdiffusers import VersatileDiffusionPipeline - >>> import paddle - >>> import requests - >>> from io import BytesIO - >>> from PIL import Image - - >>> # let's download an initial image - >>> url = "https://huggingface.co/datasets/diffusers/images/resolve/main/benz.jpg" - - >>> response = requests.get(url) - >>> image = Image.open(BytesIO(response.content)).convert("RGB") - >>> text = "a red car in the sun" - - >>> pipe = VersatileDiffusionPipeline.from_pretrained( - ... "shi-labs/versatile-diffusion", paddle_dtype=paddle.float16 - ... ) - - >>> generator = paddle.Generator().manual_seed(0) - >>> text_to_image_strength = 0.75 - - >>> image = pipe.dual_guided( - ... prompt=text, image=image, text_to_image_strength=text_to_image_strength, generator=generator - ... ).images[0] - >>> image.save("./car_variation.png") - ``` - - Returns: - [`~pipelines.stable_diffusion.ImagePipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.ImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple. When - returning a tuple, the first element is a list with the generated images. 
- """ - - expected_components = inspect.signature(VersatileDiffusionDualGuidedPipeline.__init__).parameters.keys() - components = {name: component for name, component in self.components.items() if name in expected_components} - temp_pipeline = VersatileDiffusionDualGuidedPipeline(**components) - output = temp_pipeline( - prompt=prompt, - image=image, - text_to_image_strength=text_to_image_strength, - height=height, - width=width, - num_inference_steps=num_inference_steps, - guidance_scale=guidance_scale, - num_images_per_prompt=num_images_per_prompt, - eta=eta, - generator=generator, - latents=latents, - output_type=output_type, - return_dict=return_dict, - callback=callback, - callback_steps=callback_steps, - ) - temp_pipeline._revert_dual_attention() - - return output diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py deleted file mode 100644 index 9c15eef08a5d..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +++ /dev/null @@ -1,556 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Callable, List, Optional, Tuple, Union - -import numpy as np -import paddle -import PIL - -from paddlenlp.transformers import ( - CLIPImageProcessor, - CLIPTextModelWithProjection, - CLIPTokenizer, - CLIPVisionModelWithProjection, -) - -from ...models import ( - AutoencoderKL, - DualTransformer2DModel, - Transformer2DModel, - UNet2DConditionModel, -) -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import logging, randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput -from .modeling_text_unet import UNetFlatConditionModel - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class VersatileDiffusionDualGuidedPipeline(DiffusionPipeline): - r""" - Pipeline for dual-guided generation using Versatile Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModelWithProjection`]): - Frozen text-encoder. Versatile Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - image_encoder ([`CLIPVisionModelWithProjection`]): - Frozen vision-encoder. 
Versatile Diffusion uses the vision portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModelWithProjection), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - image_unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - text_unet ([`UNetFlatConditionModel`]): xxx. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - image_feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - tokenizer: CLIPTokenizer - image_feature_extractor: CLIPImageProcessor - text_encoder: CLIPTextModelWithProjection - image_encoder: CLIPVisionModelWithProjection - image_unet: UNet2DConditionModel - text_unet: UNetFlatConditionModel - vae: AutoencoderKL - scheduler: KarrasDiffusionSchedulers - - _optional_components = ["text_unet"] - - def __init__( - self, - tokenizer: CLIPTokenizer, - image_feature_extractor: CLIPImageProcessor, - text_encoder: CLIPTextModelWithProjection, - image_encoder: CLIPVisionModelWithProjection, - image_unet: UNet2DConditionModel, - text_unet: UNetFlatConditionModel, - vae: AutoencoderKL, - scheduler: KarrasDiffusionSchedulers, - ): - super().__init__() - self.register_modules( - tokenizer=tokenizer, - image_feature_extractor=image_feature_extractor, - text_encoder=text_encoder, - image_encoder=image_encoder, - image_unet=image_unet, - text_unet=text_unet, - vae=vae, - scheduler=scheduler, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - - if self.text_unet is not None and ( - "dual_cross_attention" not in self.image_unet.config or not self.image_unet.config.dual_cross_attention - ): - # if loading from a universal checkpoint rather than a saved dual-guided pipeline - self._convert_to_dual_attention() - - def remove_unused_weights(self): - self.register_modules(text_unet=None) - - def _convert_to_dual_attention(self): - """ - Replace image_unet's `Transformer2DModel` blocks with `DualTransformer2DModel` that contains transformer blocks - from both `image_unet` and `text_unet` - """ - for name, module in self.image_unet.named_sublayers(include_self=True): - if isinstance(module, Transformer2DModel): - parent_name, index = name.rsplit(".", 1) - index = int(index) - - image_transformer = self.image_unet.get_sublayer(parent_name)[index] - text_transformer = self.text_unet.get_sublayer(parent_name)[index] - - config = image_transformer.config - dual_transformer = DualTransformer2DModel( - num_attention_heads=config.num_attention_heads, - attention_head_dim=config.attention_head_dim, - in_channels=config.in_channels, - num_layers=config.num_layers, - dropout=config.dropout, - norm_num_groups=config.norm_num_groups, - cross_attention_dim=config.cross_attention_dim, - attention_bias=config.attention_bias, - sample_size=config.sample_size, - num_vector_embeds=config.num_vector_embeds, - activation_fn=config.activation_fn, - num_embeds_ada_norm=config.num_embeds_ada_norm, - ) - dual_transformer.transformers[0] = image_transformer - dual_transformer.transformers[1] = text_transformer - - 
self.image_unet.get_sublayer(parent_name)[index] = dual_transformer - self.image_unet.register_to_config(dual_cross_attention=True) - - def _revert_dual_attention(self): - """ - Revert the image_unet `DualTransformer2DModel` blocks back to `Transformer2DModel` with image_unet weights Call - this function if you reuse `image_unet` in another pipeline, e.g. `VersatileDiffusionPipeline` - """ - for name, module in self.image_unet.named_sublayers(include_self=True): - if isinstance(module, DualTransformer2DModel): - parent_name, index = name.rsplit(".", 1) - index = int(index) - self.image_unet.get_sublayer(parent_name)[index] = module.transformers[0] - self.image_unet.register_to_config(dual_cross_attention=False) - - def _encode_text_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - """ - - def normalize_embeddings(encoder_output): - embeds = paddle.matmul(encoder_output.last_hidden_state, self.text_encoder.text_projection) - embeds_pooled = encoder_output.text_embeds - embeds = embeds / paddle.norm(embeds_pooled.unsqueeze(1), axis=-1, keepdim=True) - return embeds - - batch_size = len(prompt) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = normalize_embeddings(prompt_embeds) - - # duplicate text embeddings for each generation per prompt, using mps friendly method - bs_embed, seq_len, _ = prompt_embeds.shape - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - uncond_tokens = [""] * batch_size - max_length = text_input_ids.shape[-1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds) - - # duplicate unconditional embeddings for each generation per prompt, using mps 
friendly method - seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - def _encode_image_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance): - r""" - Encodes the prompt into vision encoder hidden states. - - Args: - prompt (`str` or `List[str]`): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - """ - - def normalize_embeddings(encoder_output): - embeds = self.image_encoder.vision_model.ln_post(encoder_output.last_hidden_state) - embeds = paddle.matmul(embeds, self.image_encoder.vision_projection) - embeds_pooled = embeds[:, 0:1] - embeds = embeds / paddle.norm(embeds_pooled, axis=-1, keepdim=True) - return embeds - - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - # get prompt text embeddings - image_input = self.image_feature_extractor(images=prompt, return_tensors="pd") - pixel_values = image_input.pixel_values.cast(self.image_encoder.dtype) - image_embeddings = self.image_encoder(pixel_values) - image_embeddings = normalize_embeddings(image_embeddings) - - # duplicate image embeddings for each generation per prompt, using mps friendly method - bs_embed, seq_len, _ = image_embeddings.shape - image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1]) - image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - uncond_images = [np.zeros((512, 512, 3)) + 0.5] * batch_size - uncond_images = self.image_feature_extractor(images=uncond_images, return_tensors="pd") - pixel_values = uncond_images.pixel_values.cast(self.image_encoder.dtype) - negative_prompt_embeds = self.image_encoder(pixel_values) - negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds) - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. 
- # Here we concatenate the unconditional and conditional embeddings into a single batch - # to avoid doing two forward passes - image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings]) - - return image_embeddings - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs(self, prompt, image, height, width, callback_steps): - if not isinstance(prompt, str) and not isinstance(prompt, PIL.Image.Image) and not isinstance(prompt, list): - raise ValueError(f"`prompt` has to be of type `str` `PIL.Image` or `list` but is {type(prompt)}") - if not isinstance(image, str) and not isinstance(image, PIL.Image.Image) and not isinstance(image, list): - raise ValueError(f"`image` has to be of type `str` `PIL.Image` or `list` but is {type(image)}") - - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - def set_transformer_params(self, mix_ratio: float = 0.5, condition_types: Tuple = ("text", "image")): - for name, module in self.image_unet.named_sublayers(include_self=True): - if isinstance(module, DualTransformer2DModel): - module.mix_ratio = mix_ratio - - for i, type in enumerate(condition_types): - if type == "text": - module.condition_lengths[i] = self.text_encoder.config.max_position_embeddings - module.transformer_index_for_condition[i] = 1 # use the second (text) transformer - else: - module.condition_lengths[i] = 257 - module.transformer_index_for_condition[i] = 0 # use the first (image) transformer - - @paddle.no_grad() - def __call__( - self, - prompt: Union[PIL.Image.Image, List[PIL.Image.Image]], - image: Union[str, List[str]], - text_to_image_strength: float = 0.5, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - **kwargs, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - height (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Examples: - - ```py - >>> from ppdiffusers import VersatileDiffusionDualGuidedPipeline - >>> import paddle - >>> import requests - >>> from io import BytesIO - >>> from PIL import Image - - >>> # let's download an initial image - >>> url = "https://huggingface.co/datasets/diffusers/images/resolve/main/benz.jpg" - - >>> response = requests.get(url) - >>> image = Image.open(BytesIO(response.content)).convert("RGB") - >>> text = "a red car in the sun" - - >>> pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained( - ... "shi-labs/versatile-diffusion", paddle_dtype=paddle.float16 - ... ) - >>> pipe.remove_unused_weights() - - >>> generator = paddle.Generator().manual_seed(0) - >>> text_to_image_strength = 0.75 - - >>> image = pipe( - ... prompt=text, image=image, text_to_image_strength=text_to_image_strength, generator=generator - ... ).images[0] - >>> image.save("./car_variation.png") - ``` - - Returns: - [`~pipelines.stable_diffusion.ImagePipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.ImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple. When - returning a tuple, the first element is a list with the generated images. - """ - # 0. Default height and width to unet - height = height or self.image_unet.config.sample_size * self.vae_scale_factor - width = width or self.image_unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs(prompt, image, height, width, callback_steps) - - # 2. Define call parameters - prompt = [prompt] if not isinstance(prompt, list) else prompt - image = [image] if not isinstance(image, list) else image - batch_size = len(prompt) - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompts - prompt_embeds = self._encode_text_prompt(prompt, num_images_per_prompt, do_classifier_free_guidance) - image_embeddings = self._encode_image_prompt(image, num_images_per_prompt, do_classifier_free_guidance) - dual_prompt_embeddings = paddle.concat([prompt_embeds, image_embeddings], axis=1) - prompt_types = ("text", "image") - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. 
Prepare latent variables - num_channels_latents = self.image_unet.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - dual_prompt_embeddings.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Combine the attention blocks of the image and text UNets - self.set_transformer_params(text_to_image_strength, prompt_types) - - # 8. Denoising loop - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=dual_prompt_embeddings).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 9. Post-processing - image = self.decode_latents(latents) - - # 10. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,) - - return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py deleted file mode 100644 index 088e3642e1a7..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +++ /dev/null @@ -1,385 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Callable, List, Optional, Union - -import numpy as np -import paddle -import PIL - -from paddlenlp.transformers import CLIPImageProcessor, CLIPVisionModelWithProjection - -from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import logging, randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class VersatileDiffusionImageVariationPipeline(DiffusionPipeline): - r""" - Pipeline for image variation using Versatile Diffusion. - This model inherits from [`DiffusionPipeline`]. 
Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - image_encoder ([`CLIPVisionModelWithProjection`]): - Frozen vision-encoder. Versatile Diffusion uses the vision portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModelWithProjection), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - image_unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - image_feature_extractor ([`CLIPImageProcessor`]): - Model that extracts features from generated images to be used as inputs for the `safety_checker`. - """ - image_feature_extractor: CLIPImageProcessor - image_encoder: CLIPVisionModelWithProjection - image_unet: UNet2DConditionModel - vae: AutoencoderKL - scheduler: KarrasDiffusionSchedulers - - def __init__( - self, - image_feature_extractor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection, - image_unet: UNet2DConditionModel, - vae: AutoencoderKL, - scheduler: KarrasDiffusionSchedulers, - ): - super().__init__() - self.register_modules( - image_feature_extractor=image_feature_extractor, - image_encoder=image_encoder, - image_unet=image_unet, - vae=vae, - scheduler=scheduler, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - - def _encode_image_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt): - r""" - Encodes the prompt into vision encoder hidden states. - - Args: - prompt (`str` or `List[str]`): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`).
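For orientation, the `vae_scale_factor` computed in `__init__` above is what `prepare_latents` later divides the requested image size by. A short sketch with hypothetical config values (the channel counts are stand-ins, not read from this patch):

```py
block_out_channels = (128, 256, 512, 512)              # hypothetical VAE config
vae_scale_factor = 2 ** (len(block_out_channels) - 1)  # 8: three downsampling stages

height = width = 512              # requested output size, must be divisible by 8
num_channels_latents = 4          # hypothetical UNet in_channels
latent_shape = (1, num_channels_latents, height // vae_scale_factor, width // vae_scale_factor)
# -> (1, 4, 64, 64)
```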
- """ - - def normalize_embeddings(encoder_output): - embeds = self.image_encoder.vision_model.ln_post(encoder_output.last_hidden_state) - embeds = paddle.matmul(embeds, self.image_encoder.vision_projection) - embeds_pooled = embeds[:, 0:1] - embeds = embeds / paddle.norm(embeds_pooled, axis=-1, keepdim=True) - return embeds - - if isinstance(prompt, paddle.Tensor) and len(prompt.shape) == 4: - prompt = list(prompt) - - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - # get prompt text embeddings - image_input = self.image_feature_extractor(images=prompt, return_tensors="pd") - pixel_values = image_input.pixel_values.cast(self.image_encoder.dtype) - image_embeddings = self.image_encoder(pixel_values) - image_embeddings = normalize_embeddings(image_embeddings) - - # duplicate image embeddings for each generation per prompt, using mps friendly method - bs_embed, seq_len, _ = image_embeddings.shape - image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1]) - image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - uncond_images: List[str] - if negative_prompt is None: - uncond_images = [np.zeros((512, 512, 3)) + 0.5] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, PIL.Image.Image): - uncond_images = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_images = negative_prompt - - uncond_images = self.image_feature_extractor(images=uncond_images, return_tensors="pd") - pixel_values = uncond_images.pixel_values.cast(self.image_encoder.dtype) - negative_prompt_embeds = self.image_encoder(pixel_values) - negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds) - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. 
- # Here we concatenate the unconditional and conditional embeddings into a single batch - # to avoid doing two forward passes - image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings]) - - return image_embeddings - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation.StableDiffusionImageVariationPipeline.check_inputs - def check_inputs(self, image, height, width, callback_steps): - if ( - not isinstance(image, paddle.Tensor) - and not isinstance(image, PIL.Image.Image) - and not isinstance(image, list) - ): - raise ValueError( - "`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" - f" {type(image)}" - ) - - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = [batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor] - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @paddle.no_grad() - def __call__( - self, - image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor], - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - **kwargs, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - image (`PIL.Image.Image`, `List[PIL.Image.Image]` or `paddle.Tensor`): - The image prompt or prompts to guide the image generation. - height (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. 
- callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Examples: - - ```py - >>> from ppdiffusers import VersatileDiffusionImageVariationPipeline - >>> import paddle - >>> import requests - >>> from io import BytesIO - >>> from PIL import Image - - >>> # let's download an initial image - >>> url = "https://huggingface.co/datasets/diffusers/images/resolve/main/benz.jpg" - - >>> response = requests.get(url) - >>> image = Image.open(BytesIO(response.content)).convert("RGB") - - >>> pipe = VersatileDiffusionImageVariationPipeline.from_pretrained( - ... "shi-labs/versatile-diffusion", paddle_dtype=paddle.float16 - ... ) - - >>> generator = paddle.Generator().manual_seed(0) - >>> image = pipe(image, generator=generator).images[0] - >>> image.save("./car_variation.png") - ``` - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Default height and width to unet - height = height or self.image_unet.config.sample_size * self.vae_scale_factor - width = width or self.image_unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs(image, height, width, callback_steps) - - # 2. Define call parameters - batch_size = 1 if isinstance(image, PIL.Image.Image) else len(image) - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - image_embeddings = self._encode_image_prompt( - image, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = self.image_unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - image_embeddings.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. 
Denoising loop - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 8. Post-processing - image = self.decode_latents(latents) - - # 9. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,) - - return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py deleted file mode 100644 index 86803330e026..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +++ /dev/null @@ -1,472 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -from typing import Callable, List, Optional, Union - -import paddle - -from paddlenlp.transformers import ( - CLIPImageProcessor, - CLIPTextModelWithProjection, - CLIPTokenizer, -) - -from ...models import AutoencoderKL, Transformer2DModel, UNet2DConditionModel -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import logging, randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput -from .modeling_text_unet import UNetFlatConditionModel - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -class VersatileDiffusionTextToImagePipeline(DiffusionPipeline): - r""" - Pipeline for text-to-image generation using Versatile Diffusion. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModelWithProjection`]): - Frozen text-encoder. 
Versatile Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - image_encoder ([`CLIPVisionModelWithProjection`]): - Frozen vision-encoder. Versatile Diffusion uses the vision portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModelWithProjection), specifically - the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - image_unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - text_unet ([`UNetFlatConditionModel`]): xxx. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - """ - tokenizer: CLIPTokenizer - image_feature_extractor: CLIPImageProcessor - text_encoder: CLIPTextModelWithProjection - image_unet: UNet2DConditionModel - text_unet: UNetFlatConditionModel - vae: AutoencoderKL - scheduler: KarrasDiffusionSchedulers - - _optional_components = ["text_unet"] - - def __init__( - self, - tokenizer: CLIPTokenizer, - text_encoder: CLIPTextModelWithProjection, - image_unet: UNet2DConditionModel, - text_unet: UNetFlatConditionModel, - vae: AutoencoderKL, - scheduler: KarrasDiffusionSchedulers, - ): - super().__init__() - self.register_modules( - tokenizer=tokenizer, - text_encoder=text_encoder, - image_unet=image_unet, - text_unet=text_unet, - vae=vae, - scheduler=scheduler, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - - if self.text_unet is not None: - self._swap_unet_attention_blocks() - - def _swap_unet_attention_blocks(self): - """ - Swap the `Transformer2DModel` blocks between the image and text UNets - """ - for name, module in self.image_unet.named_sublayers(include_self=True): - if isinstance(module, Transformer2DModel): - parent_name, index = name.rsplit(".", 1) - index = int(index) - self.image_unet.get_sublayer(parent_name)[index], self.text_unet.get_sublayer(parent_name)[index] = ( - self.text_unet.get_sublayer(parent_name)[index], - self.image_unet.get_sublayer(parent_name)[index], - ) - - def remove_unused_weights(self): - self.register_modules(text_unet=None) - - def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). 
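The "duplicate ... for each generation per prompt" comments in the encoder helpers below refer to a tile-then-reshape pattern; a small shape sketch with hypothetical sizes may help (it is not code from the original file):

```py
import paddle

prompt_embeds = paddle.randn([2, 77, 768])  # hypothetical: 2 prompts, 77 tokens, 768 dims
num_images_per_prompt = 3

bs_embed, seq_len, _ = prompt_embeds.shape
prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1])                       # [2, 231, 768]
prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])  # [6, 77, 768]
# each prompt's embedding is now repeated num_images_per_prompt times along the batch axis
```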
- """ - - def normalize_embeddings(encoder_output): - embeds = paddle.matmul(encoder_output.last_hidden_state, self.text_encoder.text_projection) - embeds_pooled = encoder_output.text_embeds - embeds = embeds / paddle.norm(embeds_pooled.unsqueeze(1), axis=-1, keepdim=True) - return embeds - - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask - else: - attention_mask = None - - prompt_embeds = self.text_encoder( - text_input_ids, - attention_mask=attention_mask, - ) - prompt_embeds = normalize_embeddings(prompt_embeds) - - # duplicate text embeddings for each generation per prompt, using mps friendly method - bs_embed, seq_len, _ = prompt_embeds.shape - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - max_length = text_input_ids.shape[-1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, - attention_mask=attention_mask, - ) - negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds) - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. 
- # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." 
- ) - - # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = [batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor] - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, dtype=dtype) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - - @paddle.no_grad() - def __call__( - self, - prompt: Union[str, List[str]], - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - **kwargs, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - height (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Examples: - - ```py - >>> from ppdiffusers import VersatileDiffusionTextToImagePipeline - >>> import paddle - - >>> pipe = VersatileDiffusionTextToImagePipeline.from_pretrained( - ... "shi-labs/versatile-diffusion", paddle_dtype=paddle.float16 - ... ) - >>> pipe.remove_unused_weights() - - >>> generator = paddle.Generator().manual_seed(0) - >>> image = pipe("an astronaut riding on a horse on mars", generator=generator).images[0] - >>> image.save("./astronaut.png") - ``` - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Default height and width to unet - height = height or self.image_unet.config.sample_size * self.vae_scale_factor - width = width or self.image_unet.config.sample_size * self.vae_scale_factor - - # 1. Check inputs. Raise error if not correct - self.check_inputs(prompt, height, width, callback_steps) - - # 2. Define call parameters - batch_size = 1 if isinstance(prompt, str) else len(prompt) - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - prompt_embeds = self._encode_prompt( - prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - num_channels_latents = self.image_unet.config.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs. - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. 
Denoising loop - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - # 9. Post-processing - image = self.decode_latents(latents) - - # 10. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,) - - return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/__init__.py deleted file mode 100644 index f7426c40427c..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# flake8: noqa - -from ...utils import is_paddle_available, is_paddlenlp_available - -if is_paddle_available() and is_paddlenlp_available(): - from .pipeline_vq_diffusion import ( - LearnedClassifierFreeSamplingEmbeddings, - VQDiffusionPipeline, - ) diff --git a/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py deleted file mode 100644 index 10ebc05c142d..000000000000 --- a/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py +++ /dev/null @@ -1,341 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 Microsoft and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Callable, List, Optional, Tuple, Union - -import paddle -import paddle.nn as nn - -from paddlenlp.transformers import CLIPTextModel, CLIPTokenizer - -from ...configuration_utils import ConfigMixin, register_to_config -from ...models import ModelMixin, Transformer2DModel, VQModel -from ...schedulers import VQDiffusionScheduler -from ...utils import logging -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -INF = 1e9 - - -# paddle logsumexp may has bug -def logsumexp(x, axis=None, keepdim=False): - return paddle.log(x.exp().sum(axis=axis, keepdim=keepdim)) - - -class LearnedClassifierFreeSamplingEmbeddings(ModelMixin, ConfigMixin): - """ - Utility class for storing learned text embeddings for classifier free sampling - """ - - @register_to_config - def __init__(self, learnable: bool, hidden_size: Optional[int] = None, length: Optional[int] = None): - super().__init__() - - self.learnable = learnable - - if self.learnable: - assert hidden_size is not None, "learnable=True requires `hidden_size` to be set" - assert length is not None, "learnable=True requires `length` to be set" - - embeddings = paddle.zeros([length, hidden_size]) - self.embeddings = self.create_parameter( - embeddings.shape, default_initializer=nn.initializer.Assign(embeddings) - ) - else: - self.embeddings = None - - -class VQDiffusionPipeline(DiffusionPipeline): - r""" - Pipeline for text-to-image generation using VQ Diffusion - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vqvae ([`VQModel`]): - Vector Quantized Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent - representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder. VQ Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the [clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) variant. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - transformer ([`Transformer2DModel`]): - Conditional transformer to denoise the encoded image latents. - scheduler ([`VQDiffusionScheduler`]): - A scheduler to be used in combination with `transformer` to denoise the encoded image latents. 
- """ - - vqvae: VQModel - text_encoder: CLIPTextModel - tokenizer: CLIPTokenizer - transformer: Transformer2DModel - learned_classifier_free_sampling_embeddings: LearnedClassifierFreeSamplingEmbeddings - scheduler: VQDiffusionScheduler - - def __init__( - self, - vqvae: VQModel, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - transformer: Transformer2DModel, - scheduler: VQDiffusionScheduler, - learned_classifier_free_sampling_embeddings: LearnedClassifierFreeSamplingEmbeddings, - ): - super().__init__() - - self.register_modules( - vqvae=vqvae, - transformer=transformer, - text_encoder=text_encoder, - tokenizer=tokenizer, - scheduler=scheduler, - learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings, - ) - - def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance): - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - # get prompt text embeddings - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - return_tensors="pd", - ) - text_input_ids = text_inputs.input_ids - - if text_input_ids.shape[-1] > self.tokenizer.model_max_length: - removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] - prompt_embeds = self.text_encoder(text_input_ids)[0] - - # NOTE: This additional step of normalizing the text embeddings is from VQ-Diffusion. - # While CLIP does normalize the pooled output of the text transformer when combining - # the image and text embeddings, CLIP does not directly normalize the last hidden state. - # - # CLIP normalizing the pooled output. 
- # https://github.com/huggingface/transformers/blob/d92e22d1f28324f513f3080e5c47c071a3916721/src/transformers/models/clip/modeling_clip.py#L1052-L1053 - prompt_embeds = prompt_embeds / prompt_embeds.norm(axis=-1, keepdim=True) - - # duplicate text embeddings for each generation per prompt, using mps friendly method - bs_embed, seq_len, _ = prompt_embeds.shape - prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) - - if do_classifier_free_guidance: - if self.learned_classifier_free_sampling_embeddings.learnable: - negative_prompt_embeds = self.learned_classifier_free_sampling_embeddings.embeddings - negative_prompt_embeds = negative_prompt_embeds.unsqueeze(0).tile([batch_size, 1, 1]) - else: - uncond_tokens = [""] * batch_size - - max_length = text_input_ids.shape[-1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pd", - ) - negative_prompt_embeds = self.text_encoder(uncond_input.input_ids)[0] - # See comment for normalizing text embeddings - negative_prompt_embeds = negative_prompt_embeds / negative_prompt_embeds.norm(axis=-1, keepdim=True) - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - @paddle.no_grad() - def __call__( - self, - prompt: Union[str, List[str]], - num_inference_steps: int = 100, - guidance_scale: float = 5.0, - truncation_rate: float = 1.0, - num_images_per_prompt: int = 1, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - latents: Optional[paddle.Tensor] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, - callback_steps: Optional[int] = 1, - ) -> Union[ImagePipelineOutput, Tuple]: - """ - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`): - The prompt or prompts to guide the image generation. - num_inference_steps (`int`, *optional*, defaults to 100): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - truncation_rate (`float`, *optional*, defaults to 1.0 (equivalent to no truncation)): - Used to "truncate" the predicted classes for x_0 such that the cumulative probability for a pixel is at - most `truncation_rate`. 
The lowest probabilities that would increase the cumulative probability above - `truncation_rate` are set to zero. - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. - generator (`paddle.Generator`, *optional*): - One or a list of paddle generator(s) to make generation deterministic. - latents (`paddle.Tensor` of shape (batch), *optional*): - Pre-generated noisy latents to be used as inputs for image generation. Must be valid embedding indices. - Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will - be generated of completely masked latent pixels. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - callback (`Callable`, *optional*): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~ pipeline_utils.ImagePipelineOutput `] if `return_dict` - is True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. - """ - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - batch_size = batch_size * num_images_per_prompt - - do_classifier_free_guidance = guidance_scale > 1.0 - - prompt_embeds = self._encode_prompt(prompt, num_images_per_prompt, do_classifier_free_guidance) - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - # get the initial completely masked latents unless the user supplied it - - latents_shape = [batch_size, self.transformer.num_latent_pixels] - if latents is None: - mask_class = self.transformer.num_vector_embeds - 1 - latents = paddle.full(latents_shape, mask_class, dtype="int64") - else: - if latents.shape != latents_shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") - if (latents < 0).any() or (latents >= self.transformer.num_vector_embeds).any(): - raise ValueError( - "Unexpected latents value(s). All latents be valid embedding indices i.e. in the range 0," - f" {self.transformer.num_vector_embeds - 1} (inclusive)." 
- ) - - # set timesteps - self.scheduler.set_timesteps(num_inference_steps) - - timesteps_tensor = self.scheduler.timesteps - - sample = latents - - for i, t in enumerate(self.progress_bar(timesteps_tensor)): - # expand the sample if we are doing classifier free guidance - latent_model_input = paddle.concat([sample] * 2) if do_classifier_free_guidance else sample - - # predict the un-noised image - # model_output == `log_p_x_0` - model_output = self.transformer(latent_model_input, encoder_hidden_states=prompt_embeds, timestep=t).sample - - if do_classifier_free_guidance: - model_output_uncond, model_output_text = model_output.chunk(2) - model_output = model_output_uncond + guidance_scale * (model_output_text - model_output_uncond) - model_output -= logsumexp(model_output, axis=1, keepdim=True) - - model_output = self.truncate(model_output, truncation_rate) - - # remove `log(0)`'s (`-inf`s) - model_output = model_output.clip(-70) - - # compute the previous noisy sample x_t -> x_t-1 - sample = self.scheduler.step(model_output, timestep=t, sample=sample, generator=generator).prev_sample - - # call the callback, if provided - if callback is not None and i % callback_steps == 0: - callback(i, t, sample) - - embedding_channels = self.vqvae.config.vq_embed_dim - embeddings_shape = (batch_size, self.transformer.height, self.transformer.width, embedding_channels) - embeddings = self.vqvae.quantize.get_codebook_entry(sample, shape=embeddings_shape) - image = self.vqvae.decode(embeddings, force_not_quantize=True).sample - - image = (image / 2 + 0.5).clip(0, 1) - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,) - - return ImagePipelineOutput(images=image) - - def truncate(self, log_p_x_0: paddle.Tensor, truncation_rate: float) -> paddle.Tensor: - """ - Truncates log_p_x_0 such that for each column vector, the total cumulative probability is `truncation_rate` The - lowest probabilities that would increase the cumulative probability above `truncation_rate` are set to zero. - """ - sorted_log_p_x_0, indices = paddle.topk(log_p_x_0, k=log_p_x_0.shape[1], axis=1) - sorted_p_x_0 = paddle.exp(sorted_log_p_x_0) - keep_mask = (sorted_p_x_0.cumsum(axis=1) < truncation_rate).cast("int64") - - # Ensure that at least the largest probability is not zeroed out - all_true = paddle.full_like(keep_mask[:, 0:1, :], 1) - keep_mask = paddle.concat((all_true, keep_mask), axis=1) - keep_mask = keep_mask[:, :-1, :] - - keep_mask = paddle.take_along_axis(keep_mask, indices.argsort(1), axis=1).cast( - "bool" - ) # keep_mask.gather(indices.argsort(1), axis=1) - rv = log_p_x_0.clone() - # rv[~keep_mask] = -INF # -inf = log(0) - rv = paddle.where(keep_mask, rv, paddle.to_tensor(-INF, dtype="float32")) - - return rv diff --git a/ppdiffusers/ppdiffusers/schedulers/__init__.py b/ppdiffusers/ppdiffusers/schedulers/__init__.py deleted file mode 100644 index 01817c75a2f4..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/__init__.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from ..utils import ( - OptionalDependencyNotAvailable, - is_paddle_available, - is_scipy_available, -) - -try: - if not is_paddle_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ..utils.dummy_paddle_objects import * # noqa F403 -else: - from .preconfig.preconfig_scheduling_euler_ancestral_discrete import ( - PreconfigEulerAncestralDiscreteScheduler, - ) - from .scheduling_ddim import DDIMScheduler - from .scheduling_ddim_inverse import DDIMInverseScheduler - from .scheduling_ddpm import DDPMScheduler - from .scheduling_deis_multistep import DEISMultistepScheduler - from .scheduling_dpmsolver_multistep import DPMSolverMultistepScheduler - from .scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler - from .scheduling_dpmsolver_unidiffuser import DPMSolverUniDiffuserScheduler - from .scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler - from .scheduling_euler_discrete import EulerDiscreteScheduler - from .scheduling_heun_discrete import HeunDiscreteScheduler - from .scheduling_ipndm import IPNDMScheduler - from .scheduling_k_dpm_2_ancestral_discrete import KDPM2AncestralDiscreteScheduler - from .scheduling_k_dpm_2_discrete import KDPM2DiscreteScheduler - from .scheduling_karras_ve import KarrasVeScheduler - from .scheduling_pndm import PNDMScheduler - from .scheduling_repaint import RePaintScheduler - from .scheduling_sde_ve import ScoreSdeVeScheduler - from .scheduling_sde_vp import ScoreSdeVpScheduler - from .scheduling_unclip import UnCLIPScheduler - from .scheduling_unipc_multistep import UniPCMultistepScheduler - from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin - from .scheduling_vq_diffusion import VQDiffusionScheduler - -try: - if not (is_paddle_available() and is_scipy_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ..utils.dummy_paddle_and_scipy_objects import * # noqa F403 -else: - from .preconfig.preconfig_scheduling_lms_discrete import ( - PreconfigLMSDiscreteScheduler, - ) - from .scheduling_lms_discrete import LMSDiscreteScheduler diff --git a/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py b/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py deleted file mode 100644 index ecff93753b32..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# flake8: noqa - -from ...utils import ( - OptionalDependencyNotAvailable, - is_paddle_available, - is_scipy_available, -) - -try: - if not is_paddle_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ...utils.dummy_paddle_objects import * # noqa F403 -else: - from .preconfig_scheduling_euler_ancestral_discrete import ( - PreconfigEulerAncestralDiscreteScheduler, - ) -try: - if not (is_paddle_available() and is_scipy_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ...utils.dummy_paddle_and_scipy_objects import * # noqa F403 -else: - from .preconfig_scheduling_lms_discrete import PreconfigLMSDiscreteScheduler diff --git a/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py b/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py deleted file mode 100644 index 3a1b59cca9dd..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py +++ /dev/null @@ -1,310 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 Katherine Crowson and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle - -from ...configuration_utils import ConfigMixin, register_to_config -from ...utils import BaseOutput, logging, randn_tensor -from ..scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -@dataclass -# Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->EulerAncestralDiscrete -class PreconfigEulerAncestralDiscreteSchedulerOutput(BaseOutput): - """ - Output class for the scheduler's step function output. - - Args: - prev_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - pred_original_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - The predicted denoised sample (x_{0}) based on the model output from the current timestep. - `pred_original_sample` can be used to preview progress or for guidance. - """ - - prev_sample: paddle.Tensor - pred_original_sample: Optional[paddle.Tensor] = None - - -# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. 
- - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. - - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ - - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 - - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return paddle.to_tensor(betas, dtype=paddle.float32) - - -class PreconfigEulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): - """ - Ancestral sampling with Euler method steps. Based on the original k-diffusion implementation by Katherine Crowson: - https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72 - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. - beta_start (`float`): the starting `beta` value of inference. - beta_end (`float`): the final `beta` value. - beta_schedule (`str`): - the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear` or `scaled_linear`. - trained_betas (`np.ndarray`, optional): - option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. - prediction_type (`str`, default `epsilon`, optional): - prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion - process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 - https://imagen.research.google/video/paper.pdf) - - """ - - _compatibles = [e.name for e in KarrasDiffusionSchedulers] - order = 1 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.0001, - beta_end: float = 0.02, - beta_schedule: str = "linear", - trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - prediction_type: str = "epsilon", - preconfig: bool = True, - ): - if trained_betas is not None: - self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) - elif beta_schedule == "linear": - self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) - elif beta_schedule == "scaled_linear": - # this schedule is very specific to the latent diffusion model. 
- self.betas = ( - paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2 - ) - elif beta_schedule == "squaredcos_cap_v2": - # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) - else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) - sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) - self.sigmas = paddle.to_tensor(sigmas) - - # standard deviation of the initial noise distribution - self.init_noise_sigma = self.sigmas.max() - - # setable values - self.num_inference_steps = None - timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() - self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) - self.is_scale_input_called = False - self.preconfig = preconfig - self.step_index_offset = 0 - - def scale_model_input( - self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor], **kwargs - ) -> paddle.Tensor: - """ - Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. - - Args: - sample (`paddle.Tensor`): input sample - timestep (`float` or `paddle.Tensor`): the current timestep in the diffusion chain - - Returns: - `paddle.Tensor`: scaled input sample - """ - self.is_scale_input_called = True - if kwargs.get("step_index") is not None: - step_index = kwargs["step_index"] + self.step_index_offset - else: - step_index = (self.timesteps == timestep).nonzero().item() - - if not self.preconfig: - sigma = self.sigmas[step_index] - sample = sample / ((sigma**2 + 1) ** 0.5) - return sample - else: - if step_index > (len(self.latent_scales) - 1): - step_index = -1 - return sample * self.latent_scales[step_index] - - def set_timesteps(self, num_inference_steps: int): - """ - Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. 
- """ - self.num_inference_steps = num_inference_steps - self.step_index_offset = 0 - - timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) - sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) - sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) - self.sigmas = paddle.to_tensor(sigmas) - self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) - if self.preconfig: - self.sigma_up = [] - self.sigma_down = [] - for step_index_i in range(len(self.timesteps)): - sigma_from = self.sigmas[step_index_i] - sigma_to = self.sigmas[step_index_i + 1] - sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5 - sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5 - self.sigma_up.append(sigma_up) - self.sigma_down.append(sigma_down) - self.latent_scales = 1 / ((self.sigmas**2 + 1) ** 0.5) - - def step( - self, - model_output: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - sample: paddle.Tensor, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - return_dict: bool = True, - **kwargs - ) -> Union[PreconfigEulerAncestralDiscreteSchedulerOutput, Tuple]: - """ - Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`float`): current timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - generator (`paddle.Generator`, optional): Random number generator. - return_dict (`bool`): option for returning tuple rather than PreconfigEulerAncestralDiscreteSchedulerOutput class - - Returns: - [`~schedulers.scheduling_utils.PreconfigEulerAncestralDiscreteSchedulerOutput`] or `tuple`: - [`~schedulers.scheduling_utils.PreconfigEulerAncestralDiscreteSchedulerOutput`] if `return_dict` is True, otherwise - a `tuple`. When returning a tuple, the first element is the sample tensor. - - """ - if not self.is_scale_input_called: - logger.warning( - "The `scale_model_input` function should be called before `step` to ensure correct denoising. " - "See `StableDiffusionPipeline` for a usage example." - ) - if kwargs.get("return_pred_original_sample") is not None: - return_pred_original_sample = kwargs["return_pred_original_sample"] - else: - return_pred_original_sample = True - if kwargs.get("step_index") is not None: - step_index = kwargs["step_index"] + self.step_index_offset - else: - step_index = (self.timesteps == timestep).nonzero().item() - sigma = self.sigmas[step_index] - if self.config.prediction_type == "epsilon" and not return_pred_original_sample: - derivative = model_output - pred_original_sample = None - else: - # 1. 
compute predicted original sample (x_0) from sigma-scaled predicted noise - if self.config.prediction_type == "epsilon": - pred_original_sample = sample - sigma * model_output - elif self.config.prediction_type == "v_prediction": - # * c_out + input * c_skip - pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) - elif self.config.prediction_type == "sample": - raise NotImplementedError("prediction_type not implemented yet: sample") - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" - ) - derivative = (sample - pred_original_sample) / sigma - if not self.preconfig: - sigma_from = self.sigmas[step_index] - sigma_to = self.sigmas[step_index + 1] - sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5 - sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5 - else: - sigma_up = self.sigma_up[step_index] - sigma_down = self.sigma_down[step_index] - # 2. Convert to an ODE derivative - dt = sigma_down - sigma - prev_sample = sample + derivative * dt - noise = randn_tensor(model_output.shape, dtype=model_output.dtype, generator=generator) - prev_sample = prev_sample + noise * sigma_up - if not return_dict: - if not return_pred_original_sample: - return (prev_sample,) - else: - return (prev_sample, pred_original_sample) - - return PreconfigEulerAncestralDiscreteSchedulerOutput( - prev_sample=prev_sample, pred_original_sample=pred_original_sample - ) - - def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, - ) -> paddle.Tensor: - # Make sure sigmas and timesteps have the same dtype as original_samples - self.sigmas = self.sigmas.cast(original_samples.dtype) - - schedule_timesteps = self.timesteps - step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] - - sigma = self.sigmas[step_indices].flatten() - while len(sigma.shape) < len(original_samples.shape): - sigma = sigma.unsqueeze(-1) - - noisy_samples = original_samples + noise * sigma - return noisy_samples - - def __len__(self): - return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py b/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py deleted file mode 100644 index 96bbb8620558..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py +++ /dev/null @@ -1,337 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 Katherine Crowson and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import math -import warnings -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle -from scipy import integrate - -from ...configuration_utils import ConfigMixin, register_to_config -from ...utils import BaseOutput -from ..scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin - - -@dataclass -# Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->LMSDiscrete -class PreconfigLMSDiscreteSchedulerOutput(BaseOutput): - """ - Output class for the scheduler's step function output. - - Args: - prev_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - pred_original_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - The predicted denoised sample (x_{0}) based on the model output from the current timestep. - `pred_original_sample` can be used to preview progress or for guidance. - """ - - prev_sample: paddle.Tensor - pred_original_sample: Optional[paddle.Tensor] = None - - -# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. - - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. - - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ - - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 - - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return paddle.to_tensor(betas, dtype=paddle.float32) - - -class PreconfigLMSDiscreteScheduler(SchedulerMixin, ConfigMixin): - """ - Linear Multistep Scheduler for discrete beta schedules. Based on the original k-diffusion implementation by - Katherine Crowson: - https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181 - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. - beta_start (`float`): the starting `beta` value of inference. - beta_end (`float`): the final `beta` value. - beta_schedule (`str`): - the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear` or `scaled_linear`. - trained_betas (`np.ndarray`, optional): - option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. 
- prediction_type (`str`, default `epsilon`, optional): - prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion - process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 - https://imagen.research.google/video/paper.pdf) - """ - - _compatibles = [e.name for e in KarrasDiffusionSchedulers] - order = 1 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.0001, - beta_end: float = 0.02, - beta_schedule: str = "linear", - trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - prediction_type: str = "epsilon", - preconfig=True, - ): - if trained_betas is not None: - self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) - elif beta_schedule == "linear": - self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) - elif beta_schedule == "scaled_linear": - # this schedule is very specific to the latent diffusion model. - self.betas = ( - paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2 - ) - elif beta_schedule == "squaredcos_cap_v2": - # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) - else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) - sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) - self.sigmas = paddle.to_tensor(sigmas) - - # standard deviation of the initial noise distribution - self.init_noise_sigma = self.sigmas.max() - - # setable values - self.num_inference_steps = None - timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() - self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) - self.derivatives = [] - self.is_scale_input_called = False - self.preconfig = preconfig - - def scale_model_input( - self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor], **kwargs - ) -> paddle.Tensor: - """ - Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the K-LMS algorithm. - - Args: - sample (`paddle.Tensor`): input sample - timestep (`float` or `paddle.Tensor`): the current timestep in the diffusion chain - - Returns: - `paddle.Tensor`: scaled input sample - """ - if kwargs.get("step_index") is not None: - step_index = kwargs["step_index"] - else: - step_index = (self.timesteps == timestep).nonzero().item() - self.is_scale_input_called = True - if not self.preconfig: - sigma = self.sigmas[step_index] - sample = sample / ((sigma**2 + 1) ** 0.5) - return sample - else: - return sample * self.latent_scales[step_index] - - def get_lms_coefficient(self, order, t, current_order): - """ - Compute a linear multistep coefficient. - - Args: - order (TODO): - t (TODO): - current_order (TODO): - """ - - def lms_derivative(tau): - prod = 1.0 - for k in range(order): - if current_order == k: - continue - prod *= (tau - self.sigmas[t - k]) / (self.sigmas[t - current_order] - self.sigmas[t - k]) - return prod - - integrated_coeff = integrate.quad(lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0] - - return integrated_coeff - - def set_timesteps(self, num_inference_steps: int, preconfig_order: int = 4): - """ - Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. 
- - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. - """ - self.num_inference_steps = num_inference_steps - - timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) - sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) - sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) - self.sigmas = paddle.to_tensor(sigmas) - self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) - - self.derivatives = [] - if self.preconfig: - self.order = preconfig_order - self.lms_coeffs = [] - self.latent_scales = [1.0 / ((sigma**2 + 1) ** 0.5) for sigma in self.sigmas] - for step_index in range(self.num_inference_steps): - order = min(step_index + 1, preconfig_order) - self.lms_coeffs.append( - [self.get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)] - ) - - def step( - self, - model_output: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - sample: paddle.Tensor, - order: int = 4, - return_dict: bool = True, - **kwargs - ) -> Union[PreconfigLMSDiscreteSchedulerOutput, Tuple]: - """ - Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`float`): current timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - order: coefficient for multi-step inference. - return_dict (`bool`): option for returning tuple rather than PreconfigLMSDiscreteSchedulerOutput class - Args in kwargs: - step_index (`int`): - return_pred_original_sample (`bool`): option for return pred_original_sample - - Returns: - [`~schedulers.scheduling_utils.PreconfigLMSDiscreteSchedulerOutput`] or `tuple`: - [`~schedulers.scheduling_utils.PreconfigLMSDiscreteSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. - When returning a tuple, the first element is the sample tensor. - - """ - if not self.is_scale_input_called: - warnings.warn( - "The `scale_model_input` function should be called before `step` to ensure correct denoising. " - "See `StableDiffusionPipeline` for a usage example." - ) - if kwargs.get("return_pred_original_sample") is not None: - return_pred_original_sample = kwargs["return_pred_original_sample"] - else: - return_pred_original_sample = True - if kwargs.get("step_index") is not None: - step_index = kwargs["step_index"] - else: - step_index = (self.timesteps == timestep).nonzero().item() - if self.config.prediction_type == "epsilon" and not return_pred_original_sample: - # if pred_original_sample is no need - self.derivatives.append(model_output) - pred_original_sample = None - else: - sigma = self.sigmas[step_index] - # 1. 
compute predicted original sample (x_0) from sigma-scaled predicted noise - if self.config.prediction_type == "epsilon": - pred_original_sample = sample - sigma * model_output - elif self.config.prediction_type == "v_prediction": - # * c_out + input * c_skip - pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) - elif self.config.prediction_type == "sample": - pred_original_sample = model_output - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" - ) - # 2. Convert to an ODE derivative - derivative = (sample - pred_original_sample) / sigma - self.derivatives.append(derivative) - - if len(self.derivatives) > order: - self.derivatives.pop(0) - - if not self.preconfig: - # 3. If not preconfiged, compute linear multistep coefficients. - order = min(step_index + 1, order) - lms_coeffs = [self.get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)] - # 4. Compute previous sample based on the derivatives path - prev_sample = sample + sum( - coeff * derivative for coeff, derivative in zip(lms_coeffs, reversed(self.derivatives)) - ) - else: - # 3. If preconfiged, direct compute previous sample based on the derivatives path - prev_sample = sample + sum( - coeff * derivative - for coeff, derivative in zip(self.lms_coeffs[step_index], reversed(self.derivatives)) - ) - - if not return_dict: - if not return_pred_original_sample: - return (prev_sample,) - else: - return (prev_sample, pred_original_sample) - - return PreconfigLMSDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) - - def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, - ) -> paddle.Tensor: - # Make sure sigmas and timesteps have the same dtype as original_samples - sigmas = self.sigmas.cast(original_samples.dtype) - schedule_timesteps = self.timesteps - - step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] - - sigma = sigmas[step_indices].flatten() - while len(sigma.shape) < len(original_samples.shape): - sigma = sigma.unsqueeze(-1) - - noisy_samples = original_samples + noise * sigma - return noisy_samples - - def __len__(self): - return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py deleted file mode 100644 index d915a5ee72f8..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py +++ /dev/null @@ -1,424 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 Stanford University Team and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion -# and https://github.com/hojonathanho/diffusion - -import math -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, randn_tensor -from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin - - -@dataclass -# Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM -class DDIMSchedulerOutput(BaseOutput): - """ - Output class for the scheduler's step function output. - - Args: - prev_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - pred_original_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - The predicted denoised sample (x_{0}) based on the model output from the current timestep. - `pred_original_sample` can be used to preview progress or for guidance. - """ - - prev_sample: paddle.Tensor - pred_original_sample: Optional[paddle.Tensor] = None - - -# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. - - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. - - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ - - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 - - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return paddle.to_tensor(betas, dtype=paddle.float32) - - -class DDIMScheduler(SchedulerMixin, ConfigMixin): - """ - Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising - diffusion probabilistic models (DDPMs) with non-Markovian guidance. - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - For more details, see the original paper: https://arxiv.org/abs/2010.02502 - - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. - beta_start (`float`): the starting `beta` value of inference. - beta_end (`float`): the final `beta` value. - beta_schedule (`str`): - the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear`, `scaled_linear`, or `squaredcos_cap_v2`. 
- trained_betas (`np.ndarray`, optional): - option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. - clip_sample (`bool`, default `True`): - option to clip predicted sample for numerical stability. - clip_sample_range (`float`, default `1.0`): - the maximum magnitude for sample clipping. Valid only when `clip_sample=True`. - set_alpha_to_one (`bool`, default `True`): - each diffusion step uses the value of alphas product at that step and at the previous one. For the final - step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`, - otherwise it uses the value of alpha at step 0. - steps_offset (`int`, default `0`): - an offset added to the inference steps. You can use a combination of `offset=1` and - `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in - stable diffusion. - prediction_type (`str`, default `epsilon`, optional): - prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion - process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 - https://imagen.research.google/video/paper.pdf) - thresholding (`bool`, default `False`): - whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487). - Note that the thresholding method is unsuitable for latent-space diffusion models (such as - stable-diffusion). - dynamic_thresholding_ratio (`float`, default `0.995`): - the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen - (https://arxiv.org/abs/2205.11487). Valid only when `thresholding=True`. - sample_max_value (`float`, default `1.0`): - the threshold value for dynamic thresholding. Valid only when `thresholding=True`. - """ - - _compatibles = [e.name for e in KarrasDiffusionSchedulers] - order = 1 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.0001, - beta_end: float = 0.02, - beta_schedule: str = "linear", - trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - clip_sample: bool = True, - set_alpha_to_one: bool = True, - steps_offset: int = 0, - prediction_type: str = "epsilon", - thresholding: bool = False, - dynamic_thresholding_ratio: float = 0.995, - clip_sample_range: float = 1.0, - sample_max_value: float = 1.0, - ): - if trained_betas is not None: - self.betas = paddle.to_tensor(trained_betas, dtype="float32") - elif beta_schedule == "linear": - self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype="float32") - elif beta_schedule == "scaled_linear": - # this schedule is very specific to the latent diffusion model. - self.betas = paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype="float32") ** 2 - elif beta_schedule == "squaredcos_cap_v2": - # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) - else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - - # At every step in ddim, we are looking into the previous alphas_cumprod - # For the final step, there is no previous alphas_cumprod because we are already at 0 - # `set_alpha_to_one` decides whether we set this parameter simply to one or - # whether we use the final alpha of the "non-previous" one. 
- self.final_alpha_cumprod = paddle.to_tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] - - # standard deviation of the initial noise distribution - self.init_noise_sigma = 1.0 - - # setable values - self.num_inference_steps = None - self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) - - def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. - - Args: - sample (`paddle.Tensor`): input sample - timestep (`int`, optional): current timestep - - Returns: - `paddle.Tensor`: scaled input sample - """ - return sample - - def _get_variance(self, timestep, prev_timestep): - alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod - beta_prod_t = 1 - alpha_prod_t - beta_prod_t_prev = 1 - alpha_prod_t_prev - - variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) - - return variance - - def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: - """ - "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the - prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by - s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing - pixels from saturation at each step. We find that dynamic thresholding results in significantly better - photorealism as well as better image-text alignment, especially when using very large guidance weights." - https://arxiv.org/abs/2205.11487 - """ - dtype = sample.dtype - batch_size, channels, height, width = sample.shape - - if dtype not in (paddle.float32, paddle.float64): - sample = paddle.cast( - sample, "float32" - ) # upcast for quantile calculation, and clamp not implemented for cpu half - - # Flatten sample for doing quantile calculation along each image - sample = paddle.reshape(sample, [batch_size, channels * height * width]) - - abs_sample = sample.abs() # "a certain percentile absolute pixel value" - - s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1) - # paddle.clip donot support min > max - if self.config.sample_max_value < 1: - s = paddle.ones_like(s) * self.config.sample_max_value - else: - s = paddle.clip( - s, min=1, max=self.config.sample_max_value - ) # When clip to min=1, equivalent to standard clipping to [-1, 1] - - s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0 - sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" - - sample = paddle.reshape(sample, [batch_size, channels, height, width]) - sample = paddle.cast(sample, dtype) - - return sample - - def set_timesteps(self, num_inference_steps: int): - """ - Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. 
- """ - - if num_inference_steps > self.config.num_train_timesteps: - raise ValueError( - f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" - f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" - f" maximal {self.config.num_train_timesteps} timesteps." - ) - - self.num_inference_steps = num_inference_steps - step_ratio = self.config.num_train_timesteps // self.num_inference_steps - # creates integer timesteps by multiplying by ratio - # casting to int to avoid issues when num_inference_step is power of 3 - timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) - self.timesteps = paddle.to_tensor(timesteps) - self.timesteps += self.config.steps_offset - - def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - eta: float = 0.0, - use_clipped_model_output: bool = False, - generator=None, - variance_noise: Optional[paddle.Tensor] = None, - return_dict: bool = True, - ) -> Union[DDIMSchedulerOutput, Tuple]: - """ - Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - eta (`float`): weight of noise for added noise in diffusion step. - use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped - predicted original sample. Necessary because predicted original sample is clipped to [-1, 1] when - `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would - coincide with the one provided as input and `use_clipped_model_output` will have not effect. - generator: random number generator. - variance_noise (`paddle.Tensor`): instead of generating noise for the variance using `generator`, we - can directly provide the noise for the variance itself. This is useful for methods such as - CycleDiffusion. (https://arxiv.org/abs/2210.05559) - return_dict (`bool`): option for returning tuple rather than DDIMSchedulerOutput class - - Returns: - [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] or `tuple`: - [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. - - """ - if self.num_inference_steps is None: - raise ValueError( - "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" - ) - - # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf - # Ideally, read DDIM paper in-detail understanding - - # Notation ( -> - # - pred_noise_t -> e_theta(x_t, t) - # - pred_original_sample -> f_theta(x_t, t) or x_0 - # - std_dev_t -> sigma_t - # - eta -> η - # - pred_sample_direction -> "direction pointing to x_t" - # - pred_prev_sample -> "x_t-1" - - # 1. get previous step value (=t-1) - prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps - - # 2. compute alphas, betas - alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod - - beta_prod_t = 1 - alpha_prod_t - - # 3. 
compute predicted original sample from predicted noise also called - # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - if self.config.prediction_type == "epsilon": - pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) - pred_epsilon = model_output - elif self.config.prediction_type == "sample": - pred_original_sample = model_output - pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) - elif self.config.prediction_type == "v_prediction": - pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output - pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction`" - ) - - # 4. Clip or threshold "predicted x_0" - if self.config.thresholding: - pred_original_sample = self._threshold_sample(pred_original_sample) - elif self.config.clip_sample: - pred_original_sample = pred_original_sample.clip( - -self.config.clip_sample_range, self.config.clip_sample_range - ) - - # 5. compute variance: "sigma_t(η)" -> see formula (16) - # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) - variance = self._get_variance(timestep, prev_timestep) - std_dev_t = eta * variance ** (0.5) - - if use_clipped_model_output: - # the pred_epsilon is always re-derived from the clipped x_0 in Glide - pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) - - # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon - - # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction - - if eta > 0: - if variance_noise is not None and generator is not None: - raise ValueError( - "Cannot pass both generator and variance_noise. Please make sure that either `generator` or" - " `variance_noise` stays `None`." 
- ) - - if variance_noise is None: - variance_noise = randn_tensor(model_output.shape, generator=generator, dtype=model_output.dtype) - variance = std_dev_t * variance_noise - - prev_sample = prev_sample + variance - - if not return_dict: - return (prev_sample,) - - return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) - - def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, - ) -> paddle.Tensor: - # Make sure alphas_cumprod and timestep have same dtype as original_samples - alphas_cumprod = self.alphas_cumprod.cast(dtype=original_samples.dtype) - - sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 - sqrt_alpha_prod = sqrt_alpha_prod.flatten() - while len(sqrt_alpha_prod.shape) < len(original_samples.shape): - sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - - noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise - return noisy_samples - - def get_velocity(self, sample: paddle.Tensor, noise: paddle.Tensor, timesteps: paddle.Tensor) -> paddle.Tensor: - # Make sure alphas_cumprod and timestep have same dtype as sample - alphas_cumprod = self.alphas_cumprod.cast(dtype=sample.dtype) - - sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 - sqrt_alpha_prod = sqrt_alpha_prod.flatten() - while len(sqrt_alpha_prod.shape) < len(sample.shape): - sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - - velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample - return velocity - - def __len__(self): - return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim_inverse.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim_inverse.py deleted file mode 100644 index 0044dfde5919..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim_inverse.py +++ /dev/null @@ -1,267 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
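For context, the DDIMScheduler removed above and the DDIMInverseScheduler removed below expose the same
set_timesteps / scale_model_input / step interface. Below is a minimal sketch of a deterministic (eta=0)
DDIM denoising loop against that interface; the `unet(latents, t)` noise-prediction callable and the
latent shape are hypothetical placeholders used only for illustration, not part of the removed files:

    import paddle
    # module path corresponds to the removed file ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py
    from ppdiffusers.schedulers.scheduling_ddim import DDIMScheduler

    scheduler = DDIMScheduler()   # defaults shown above: 1000 train steps, linear betas
    scheduler.set_timesteps(50)   # 50 evenly spaced inference timesteps

    latents = paddle.randn([1, 4, 64, 64]) * scheduler.init_noise_sigma
    for t in scheduler.timesteps:
        model_input = scheduler.scale_model_input(latents, t)  # identity for DDIM
        noise_pred = unet(model_input, t)                       # hypothetical UNet call
        latents = scheduler.step(noise_pred, t, latents, eta=0.0).prev_sample
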
- -# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion -# and https://github.com/hojonathanho/diffusion -import math -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, deprecate -from .scheduling_utils import SchedulerMixin - - -@dataclass -# Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM -class DDIMSchedulerOutput(BaseOutput): - """ - Output class for the scheduler's step function output. - - Args: - prev_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - pred_original_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - The predicted denoised sample (x_{0}) based on the model output from the current timestep. - `pred_original_sample` can be used to preview progress or for guidance. - """ - - prev_sample: paddle.Tensor - pred_original_sample: Optional[paddle.Tensor] = None - - -# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. - - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. - - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ - - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 - - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return paddle.to_tensor(betas, dtype=paddle.float32) - - -class DDIMInverseScheduler(SchedulerMixin, ConfigMixin): - """ - DDIMInverseScheduler is the reverse scheduler of [`DDIMScheduler`]. - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - For more details, see the original paper: https://arxiv.org/abs/2010.02502 - - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. - beta_start (`float`): the starting `beta` value of inference. - beta_end (`float`): the final `beta` value. - beta_schedule (`str`): - the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear`, `scaled_linear`, or `squaredcos_cap_v2`. - trained_betas (`np.ndarray`, optional): - option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. 
- clip_sample (`bool`, default `True`): - option to clip predicted sample for numerical stability. - clip_sample_range (`float`, default `1.0`): - the maximum magnitude for sample clipping. Valid only when `clip_sample=True`. - set_alpha_to_zero (`bool`, default `True`): - each diffusion step uses the value of alphas product at that step and at the previous one. For the final - step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `0`, - otherwise it uses the value of alpha at step `num_train_timesteps - 1`. - steps_offset (`int`, default `0`): - an offset added to the inference steps. You can use a combination of `offset=1` and - `set_alpha_to_zero=False`, to make the last step use step `num_train_timesteps - 1` for the previous alpha - product. - prediction_type (`str`, default `epsilon`, optional): - prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion - process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 - https://imagen.research.google/video/paper.pdf) - """ - - order = 1 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.0001, - beta_end: float = 0.02, - beta_schedule: str = "linear", - trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - clip_sample: bool = True, - set_alpha_to_zero: bool = True, - steps_offset: int = 0, - prediction_type: str = "epsilon", - clip_sample_range: float = 1.0, - **kwargs, - ): - if kwargs.get("set_alpha_to_one", None) is not None: - deprecation_message = ( - "The `set_alpha_to_one` argument is deprecated. Please use `set_alpha_to_zero` instead." - ) - deprecate("set_alpha_to_one", "1.0.0", deprecation_message, standard_warn=False) - set_alpha_to_zero = kwargs["set_alpha_to_one"] - - if trained_betas is not None: - self.betas = paddle.to_tensor(trained_betas, dtype="float32") - elif beta_schedule == "linear": - self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype="float32") - elif beta_schedule == "scaled_linear": - # this schedule is very specific to the latent diffusion model. - self.betas = paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype="float32") ** 2 - elif beta_schedule == "squaredcos_cap_v2": - # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) - else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - - # At every step in inverted ddim, we are looking into the next alphas_cumprod - # For the final step, there is no next alphas_cumprod, and the index is out of bounds - # `set_alpha_to_zero` decides whether we set this parameter simply to zero - # in this case, self.step() just output the predicted noise - # or whether we use the final alpha of the "non-previous" one. - self.final_alpha_cumprod = paddle.to_tensor(0.0) if set_alpha_to_zero else self.alphas_cumprod[-1] - - # standard deviation of the initial noise distribution - self.init_noise_sigma = 1.0 - - # setable values - self.num_inference_steps = None - self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps).copy().astype(np.int64)) - - def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. 
- - Args: - sample (`paddle.Tensor`): input sample - timestep (`int`, optional): current timestep - - Returns: - `paddle.Tensor`: scaled input sample - """ - return sample - - def set_timesteps(self, num_inference_steps: int): - """ - Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. - """ - - if num_inference_steps > self.config.num_train_timesteps: - raise ValueError( - f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" - f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" - f" maximal {self.config.num_train_timesteps} timesteps." - ) - - self.num_inference_steps = num_inference_steps - step_ratio = self.config.num_train_timesteps // self.num_inference_steps - # creates integer timesteps by multiplying by ratio - # casting to int to avoid issues when num_inference_step is power of 3 - timesteps = (np.arange(0, num_inference_steps) * step_ratio).round().copy().astype(np.int64) - self.timesteps = paddle.to_tensor(timesteps) - self.timesteps += self.config.steps_offset - - def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - eta: float = 0.0, - use_clipped_model_output: bool = False, - variance_noise: Optional[paddle.Tensor] = None, - return_dict: bool = True, - ) -> Union[DDIMSchedulerOutput, Tuple]: - # 1. get previous step value (=t+1) - prev_timestep = timestep + self.config.num_train_timesteps // self.num_inference_steps - - # 2. compute alphas, betas - # change original implementation to exactly match noise levels for analogous forward process - alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = ( - self.alphas_cumprod[prev_timestep] - if prev_timestep < self.config.num_train_timesteps - else self.final_alpha_cumprod - ) - - beta_prod_t = 1 - alpha_prod_t - - # 3. compute predicted original sample from predicted noise also called - # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - if self.config.prediction_type == "epsilon": - pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) - pred_epsilon = model_output - elif self.config.prediction_type == "sample": - pred_original_sample = model_output - pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) - elif self.config.prediction_type == "v_prediction": - pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output - pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction`" - ) - - # 4. Clip or threshold "predicted x_0" - if self.config.clip_sample: - pred_original_sample = pred_original_sample.clip( - -self.config.clip_sample_range, self.config.clip_sample_range - ) - - # 5. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * pred_epsilon - - # 6. 
compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction - - if not return_dict: - return (prev_sample, pred_original_sample) - return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) - - def __len__(self): - return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py deleted file mode 100644 index 8cf8bbd0e3d3..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py +++ /dev/null @@ -1,459 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 UC Berkeley Team and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim - -import math -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle -import paddle.nn.functional as F - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, randn_tensor -from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin - - -@dataclass -class DDPMSchedulerOutput(BaseOutput): - """ - Output class for the scheduler's step function output. - - Args: - prev_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - pred_original_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - The predicted denoised sample (x_{0}) based on the model output from the current timestep. - `pred_original_sample` can be used to preview progress or for guidance. - """ - - prev_sample: paddle.Tensor - pred_original_sample: Optional[paddle.Tensor] = None - - -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. - - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. 
- - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ - - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 - - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return paddle.to_tensor(betas, dtype=paddle.float32) - - -class DDPMScheduler(SchedulerMixin, ConfigMixin): - """ - Denoising diffusion probabilistic models (DDPMs) explores the connections between denoising score matching and - Langevin dynamics sampling. - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - For more details, see the original paper: https://arxiv.org/abs/2006.11239 - - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. - beta_start (`float`): the starting `beta` value of inference. - beta_end (`float`): the final `beta` value. - beta_schedule (`str`): - the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear`, `scaled_linear`, or `squaredcos_cap_v2`. - trained_betas (`np.ndarray`, optional): - option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. - variance_type (`str`): - options to clip the variance used when adding noise to the denoised sample. Choose from `fixed_small`, - `fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`. - clip_sample (`bool`, default `True`): - option to clip predicted sample for numerical stability. - clip_sample_range (`float`, default `1.0`): - the maximum magnitude for sample clipping. Valid only when `clip_sample=True`. - prediction_type (`str`, default `epsilon`, optional): - prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion - process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 - https://imagen.research.google/video/paper.pdf) - thresholding (`bool`, default `False`): - whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487). - Note that the thresholding method is unsuitable for latent-space diffusion models (such as - stable-diffusion). - dynamic_thresholding_ratio (`float`, default `0.995`): - the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen - (https://arxiv.org/abs/2205.11487). Valid only when `thresholding=True`. - sample_max_value (`float`, default `1.0`): - the threshold value for dynamic thresholding. Valid only when `thresholding=True`. 
- """ - - _compatibles = [e.name for e in KarrasDiffusionSchedulers] - order = 1 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.0001, - beta_end: float = 0.02, - beta_schedule: str = "linear", - trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - variance_type: str = "fixed_small", - clip_sample: bool = True, - prediction_type: str = "epsilon", - thresholding: bool = False, - dynamic_thresholding_ratio: float = 0.995, - clip_sample_range: float = 1.0, - sample_max_value: float = 1.0, - ): - if trained_betas is not None: - self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) - elif beta_schedule == "linear": - self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) - elif beta_schedule == "scaled_linear": - # this schedule is very specific to the latent diffusion model. - self.betas = ( - paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2 - ) - elif beta_schedule == "squaredcos_cap_v2": - # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) - elif beta_schedule == "sigmoid": - # GeoDiff sigmoid schedule - betas = paddle.linspace(-6, 6, num_train_timesteps) - self.betas = F.sigmoid(betas) * (beta_end - beta_start) + beta_start - else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - self.one = paddle.to_tensor(1.0) - - # standard deviation of the initial noise distribution - self.init_noise_sigma = 1.0 - - # setable values - self.custom_timesteps = False - self.num_inference_steps = None - self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy()) - - self.variance_type = variance_type - - def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. - - Args: - sample (`paddle.Tensor`): input sample - timestep (`int`, optional): current timestep - - Returns: - `paddle.Tensor`: scaled input sample - """ - return sample - - def set_timesteps( - self, - num_inference_steps: Optional[int] = None, - timesteps: Optional[List[int]] = None, - ): - """ - Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`Optional[int]`): - the number of diffusion steps used when generating samples with a pre-trained model. If passed, then - `timesteps` must be `None`. - custom_timesteps (`List[int]`, optional): - custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of equal spacing between timesteps is used. If passed, `num_inference_steps` - must be `None`. - """ - if num_inference_steps is not None and timesteps is not None: - raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.") - - if timesteps is not None: - for i in range(1, len(timesteps)): - if timesteps[i] >= timesteps[i - 1]: - raise ValueError("`custom_timesteps` must be in descending order.") - - if timesteps[0] >= self.config.num_train_timesteps: - raise ValueError( - f"`timesteps` must start before `self.config.train_timesteps`:" - f" {self.config.num_train_timesteps}." 
- ) - - timesteps = np.array(timesteps, dtype=np.int64) - self.custom_timesteps = True - else: - if num_inference_steps > self.config.num_train_timesteps: - raise ValueError( - f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" - f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" - f" maximal {self.config.num_train_timesteps} timesteps." - ) - self.num_inference_steps = num_inference_steps - step_ratio = self.config.num_train_timesteps // self.num_inference_steps - timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) - self.custom_timesteps = False - - self.timesteps = paddle.to_tensor(timesteps) - - def _get_variance(self, t, predicted_variance=None, variance_type=None): - prev_t = self.previous_timestep(t) - - alpha_prod_t = self.alphas_cumprod[t] - alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one - current_beta_t = 1 - alpha_prod_t / alpha_prod_t_prev - - # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf) - # and sample from it to get previous sample - # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample - variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * current_beta_t - - # we always take the log of variance, so clamp it to ensure it's not 0 - variance = paddle.clip(variance, min=1e-20) - - if variance_type is None: - variance_type = self.config.variance_type - - # hacks - were probably added for training stability - if variance_type == "fixed_small": - variance = variance - # for rl-diffuser https://arxiv.org/abs/2205.09991 - elif variance_type == "fixed_small_log": - variance = paddle.log(variance) - variance = paddle.exp(0.5 * variance) - elif variance_type == "fixed_large": - variance = current_beta_t - elif variance_type == "fixed_large_log": - # Glide max_log - variance = paddle.log(current_beta_t) - elif variance_type == "learned": - return predicted_variance - elif variance_type == "learned_range": - min_log = paddle.log(variance) - max_log = paddle.log(current_beta_t) - frac = (predicted_variance + 1) / 2 - variance = frac * max_log + (1 - frac) * min_log - - return variance - - def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: - """ - "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the - prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by - s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing - pixels from saturation at each step. We find that dynamic thresholding results in significantly better - photorealism as well as better image-text alignment, especially when using very large guidance weights." 
- https://arxiv.org/abs/2205.11487 - """ - dtype = sample.dtype - batch_size, channels, height, width = sample.shape - - if dtype not in (paddle.float32, paddle.float64): - sample = paddle.cast( - sample, "float32" - ) # upcast for quantile calculation, and clamp not implemented for cpu half - - # Flatten sample for doing quantile calculation along each image - sample = paddle.reshape(sample, [batch_size, channels * height * width]) - - abs_sample = sample.abs() # "a certain percentile absolute pixel value" - - s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1) - # paddle.clip donot support min > max - if self.config.sample_max_value < 1: - s = paddle.ones_like(s) * self.config.sample_max_value - else: - s = paddle.clip( - s, min=1, max=self.config.sample_max_value - ) # When clip to min=1, equivalent to standard clipping to [-1, 1] - s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0 - sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" - - sample = paddle.reshape(sample, [batch_size, channels, height, width]) - sample = paddle.cast(sample, dtype) - - return sample - - def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - generator=None, - return_dict: bool = True, - ) -> Union[DDPMSchedulerOutput, Tuple]: - """ - Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - generator: random number generator. - return_dict (`bool`): option for returning tuple rather than DDPMSchedulerOutput class - - Returns: - [`~schedulers.scheduling_utils.DDPMSchedulerOutput`] or `tuple`: - [`~schedulers.scheduling_utils.DDPMSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. - - """ - t = timestep - prev_t = self.previous_timestep(t) - - if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]: - model_output, predicted_variance = model_output.chunk(2, axis=1) - else: - predicted_variance = None - - # 1. compute alphas, betas - alpha_prod_t = self.alphas_cumprod[t] - alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one - beta_prod_t = 1 - alpha_prod_t - beta_prod_t_prev = 1 - alpha_prod_t_prev - current_alpha_t = alpha_prod_t / alpha_prod_t_prev - current_beta_t = 1 - current_alpha_t - - # 2. compute predicted original sample from predicted noise also called - # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf - if self.config.prediction_type == "epsilon": - pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) - elif self.config.prediction_type == "sample": - pred_original_sample = model_output - elif self.config.prediction_type == "v_prediction": - pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or" - " `v_prediction` for the DDPMScheduler." - ) - - # 3. 
Clip or threshold "predicted x_0" - if self.config.thresholding: - pred_original_sample = self._threshold_sample(pred_original_sample) - elif self.config.clip_sample: - pred_original_sample = paddle.clip( - pred_original_sample, -self.config.clip_sample_range, self.config.clip_sample_range - ) - - # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t - # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf - pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * current_beta_t) / beta_prod_t - current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t - - # 5. Compute predicted previous sample µ_t - # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf - pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample - - # 6. Add noise - variance = 0 - if t > 0: - variance_noise = randn_tensor(model_output.shape, generator=generator, dtype=model_output.dtype) - if self.variance_type == "fixed_small_log": - variance = self._get_variance(t, predicted_variance=predicted_variance) * variance_noise - elif self.variance_type == "learned_range": - variance = self._get_variance(t, predicted_variance=predicted_variance) - variance = paddle.exp(0.5 * variance) * variance_noise - else: - variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * variance_noise - - pred_prev_sample = pred_prev_sample + variance - - if not return_dict: - return (pred_prev_sample,) - - return DDPMSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample) - - def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, - ) -> paddle.Tensor: - # Make sure alphas_cumprod and timestep have same dtype as original_samples - alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype) - - sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 - sqrt_alpha_prod = sqrt_alpha_prod.flatten() - while len(sqrt_alpha_prod.shape) < len(original_samples.shape): - sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - - noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise - return noisy_samples - - def get_velocity(self, sample: paddle.Tensor, noise: paddle.Tensor, timesteps: paddle.Tensor) -> paddle.Tensor: - # Make sure alphas_cumprod and timestep have same dtype as original_samples - alphas_cumprod = self.alphas_cumprod.cast(sample.dtype) - - sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 - sqrt_alpha_prod = sqrt_alpha_prod.flatten() - while len(sqrt_alpha_prod.shape) < len(sample.shape): - sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - - velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample - return velocity - - def __len__(self): - return self.config.num_train_timesteps - - def previous_timestep(self, timestep): - if self.custom_timesteps: - index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0] - if index == 
self.timesteps.shape[0] - 1: - prev_t = paddle.to_tensor(-1) - else: - prev_t = self.timesteps[index + 1] - else: - num_inference_steps = ( - self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps - ) - prev_t = timestep - self.config.num_train_timesteps // num_inference_steps - - return prev_t diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_deis_multistep.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_deis_multistep.py deleted file mode 100644 index 7d62b0796d6b..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_deis_multistep.py +++ /dev/null @@ -1,509 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 FLAIR Lab and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# DISCLAIMER: check https://arxiv.org/abs/2204.13902 and https://github.com/qsh-zh/deis for more info -# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py - -import math -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle - -from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput - - -# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. - - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. - - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ - - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 - - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return paddle.to_tensor(betas, dtype=paddle.float32) - - -class DEISMultistepScheduler(SchedulerMixin, ConfigMixin): - """ - DEIS (https://arxiv.org/abs/2204.13902) is a fast high order solver for diffusion ODEs. We slightly modify the - polynomial fitting formula in log-rho space instead of the original linear t space in DEIS paper. The modification - enjoys closed-form coefficients for exponential multistep update instead of replying on the numerical solver. More - variants of DEIS can be found in https://github.com/qsh-zh/deis. - - Currently, we support the log-rho multistep DEIS. We recommend to use `solver_order=2 / 3` while `solver_order=1` - reduces to DDIM. 
- - We also support the "dynamic thresholding" method in Imagen (https://arxiv.org/abs/2205.11487). For pixel-space - diffusion models, you can set `thresholding=True` to use the dynamic thresholding. - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. - beta_start (`float`): the starting `beta` value of inference. - beta_end (`float`): the final `beta` value. - beta_schedule (`str`): - the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear`, `scaled_linear`, or `squaredcos_cap_v2`. - trained_betas (`np.ndarray`, optional): - option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. - solver_order (`int`, default `2`): - the order of DEIS; can be `1` or `2` or `3`. We recommend to use `solver_order=2` for guided sampling, and - `solver_order=3` for unconditional sampling. - prediction_type (`str`, default `epsilon`): - indicates whether the model predicts the noise (epsilon), or the data / `x0`. One of `epsilon`, `sample`, - or `v-prediction`. - thresholding (`bool`, default `False`): - whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487). - Note that the thresholding method is unsuitable for latent-space diffusion models (such as - stable-diffusion). - dynamic_thresholding_ratio (`float`, default `0.995`): - the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen - (https://arxiv.org/abs/2205.11487). - sample_max_value (`float`, default `1.0`): - the threshold value for dynamic thresholding. Valid only when `thresholding=True` - algorithm_type (`str`, default `deis`): - the algorithm type for the solver. current we support multistep deis, we will add other variants of DEIS in - the future - lower_order_final (`bool`, default `True`): - whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. We empirically - find this trick can stabilize the sampling of DEIS for steps < 15, especially for steps <= 10. - - """ - - _compatibles = [e.name for e in KarrasDiffusionSchedulers] - order = 1 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.0001, - beta_end: float = 0.02, - beta_schedule: str = "linear", - trained_betas: Optional[np.ndarray] = None, - solver_order: int = 2, - prediction_type: str = "epsilon", - thresholding: bool = False, - dynamic_thresholding_ratio: float = 0.995, - sample_max_value: float = 1.0, - algorithm_type: str = "deis", - solver_type: str = "logrho", - lower_order_final: bool = True, - ): - if trained_betas is not None: - self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) - elif beta_schedule == "linear": - self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) - elif beta_schedule == "scaled_linear": - # this schedule is very specific to the latent diffusion model. 
- self.betas = ( - paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2 - ) - elif beta_schedule == "squaredcos_cap_v2": - # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) - else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - # Currently we only support VP-type noise schedule - self.alpha_t = paddle.sqrt(self.alphas_cumprod) - self.sigma_t = paddle.sqrt(1 - self.alphas_cumprod) - self.lambda_t = paddle.log(self.alpha_t) - paddle.log(self.sigma_t) - - # standard deviation of the initial noise distribution - self.init_noise_sigma = 1.0 - - # settings for DEIS - if algorithm_type not in ["deis"]: - if algorithm_type in ["dpmsolver", "dpmsolver++"]: - self.register_to_config(algorithm_type="deis") - else: - raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") - - if solver_type not in ["logrho"]: - if solver_type in ["midpoint", "heun", "bh1", "bh2"]: - self.register_to_config(solver_type="logrho") - else: - raise NotImplementedError(f"solver type {solver_type} does is not implemented for {self.__class__}") - - # setable values - self.num_inference_steps = None - timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy() - self.timesteps = paddle.to_tensor(timesteps) - self.model_outputs = [None] * solver_order - self.lower_order_nums = 0 - - def set_timesteps(self, num_inference_steps: int): - """ - Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. - """ - timesteps = ( - np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1) - .round()[::-1][:-1] - .copy() - .astype(np.int64) - ) - - # when num_inference_steps == num_train_timesteps, we can end up with - # duplicates in timesteps. - _, unique_indices = np.unique(timesteps, return_index=True) - timesteps = timesteps[np.sort(unique_indices)] - - self.timesteps = paddle.to_tensor(timesteps) - - self.num_inference_steps = len(timesteps) - - self.model_outputs = [ - None, - ] * self.config.solver_order - self.lower_order_nums = 0 - - def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: - """ - "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the - prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by - s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing - pixels from saturation at each step. We find that dynamic thresholding results in significantly better - photorealism as well as better image-text alignment, especially when using very large guidance weights." 
- https://arxiv.org/abs/2205.11487 - """ - dtype = sample.dtype - batch_size, channels, height, width = sample.shape - - if dtype not in (paddle.float32, paddle.float64): - sample = paddle.cast( - sample, "float32" - ) # upcast for quantile calculation, and clamp not implemented for cpu half - - # Flatten sample for doing quantile calculation along each image - sample = paddle.reshape(sample, [batch_size, channels * height * width]) - - abs_sample = sample.abs() # "a certain percentile absolute pixel value" - - s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1) - # paddle.clip donot support min > max - if self.config.sample_max_value < 1: - s = paddle.ones_like(s) * self.config.sample_max_value - else: - s = paddle.clip( - s, min=1, max=self.config.sample_max_value - ) # When clip to min=1, equivalent to standard clipping to [-1, 1] - s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0 - sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" - - sample = paddle.reshape(sample, [batch_size, channels, height, width]) - sample = paddle.cast(sample, dtype) - - return sample - - def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor: - """ - Convert the model output to the corresponding type that the algorithm DEIS needs. - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - - Returns: - `paddle.Tensor`: the converted model output. - """ - if self.config.prediction_type == "epsilon": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] - x0_pred = (sample - sigma_t * model_output) / alpha_t - elif self.config.prediction_type == "sample": - x0_pred = model_output - elif self.config.prediction_type == "v_prediction": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] - x0_pred = alpha_t * sample - sigma_t * model_output - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the DEISMultistepScheduler." - ) - - if self.config.thresholding: - x0_pred = self._threshold_sample(x0_pred) - - if self.config.algorithm_type == "deis": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] - return (sample - alpha_t * x0_pred) / sigma_t - else: - raise NotImplementedError("only support log-rho multistep deis now") - - def deis_first_order_update( - self, - model_output: paddle.Tensor, - timestep: int, - prev_timestep: int, - sample: paddle.Tensor, - ) -> paddle.Tensor: - """ - One step for the first-order DEIS (equivalent to DDIM). - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - prev_timestep (`int`): previous discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - - Returns: - `paddle.Tensor`: the sample tensor at the previous timestep. 
- """ - lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep] - alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep] - sigma_t, _ = self.sigma_t[prev_timestep], self.sigma_t[timestep] - h = lambda_t - lambda_s - if self.config.algorithm_type == "deis": - x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0)) * model_output - else: - raise NotImplementedError("only support log-rho multistep deis now") - return x_t - - def multistep_deis_second_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, - ) -> paddle.Tensor: - """ - One step for the second-order multistep DEIS. - - Args: - model_output_list (`List[paddle.Tensor]`): - direct outputs from learned diffusion model at current and latter timesteps. - timestep (`int`): current and latter discrete timestep in the diffusion chain. - prev_timestep (`int`): previous discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - - Returns: - `paddle.Tensor`: the sample tensor at the previous timestep. - """ - t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2] - m0, m1 = model_output_list[-1], model_output_list[-2] - alpha_t, alpha_s0, alpha_s1 = self.alpha_t[t], self.alpha_t[s0], self.alpha_t[s1] - sigma_t, sigma_s0, sigma_s1 = self.sigma_t[t], self.sigma_t[s0], self.sigma_t[s1] - - rho_t, rho_s0, rho_s1 = sigma_t / alpha_t, sigma_s0 / alpha_s0, sigma_s1 / alpha_s1 - - if self.config.algorithm_type == "deis": - - def ind_fn(t, b, c): - # Integrate[(log(t) - log(c)) / (log(b) - log(c)), {t}] - return t * (-paddle.log(c) + paddle.log(t) - 1) / (paddle.log(b) - paddle.log(c)) - - coef1 = ind_fn(rho_t, rho_s0, rho_s1) - ind_fn(rho_s0, rho_s0, rho_s1) - coef2 = ind_fn(rho_t, rho_s1, rho_s0) - ind_fn(rho_s0, rho_s1, rho_s0) - - x_t = alpha_t * (sample / alpha_s0 + coef1 * m0 + coef2 * m1) - return x_t - else: - raise NotImplementedError("only support log-rho multistep deis now") - - def multistep_deis_third_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, - ) -> paddle.Tensor: - """ - One step for the third-order multistep DEIS. - - Args: - model_output_list (`List[paddle.Tensor]`): - direct outputs from learned diffusion model at current and latter timesteps. - timestep (`int`): current and latter discrete timestep in the diffusion chain. - prev_timestep (`int`): previous discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - - Returns: - `paddle.Tensor`: the sample tensor at the previous timestep. 
- """ - t, s0, s1, s2 = prev_timestep, timestep_list[-1], timestep_list[-2], timestep_list[-3] - m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3] - alpha_t, alpha_s0, alpha_s1, alpha_s2 = self.alpha_t[t], self.alpha_t[s0], self.alpha_t[s1], self.alpha_t[s2] - sigma_t, sigma_s0, sigma_s1, simga_s2 = self.sigma_t[t], self.sigma_t[s0], self.sigma_t[s1], self.sigma_t[s2] - rho_t, rho_s0, rho_s1, rho_s2 = ( - sigma_t / alpha_t, - sigma_s0 / alpha_s0, - sigma_s1 / alpha_s1, - simga_s2 / alpha_s2, - ) - - if self.config.algorithm_type == "deis": - - def ind_fn(t, b, c, d): - # Integrate[(log(t) - log(c))(log(t) - log(d)) / (log(b) - log(c))(log(b) - log(d)), {t}] - numerator = t * ( - paddle.log(c) * (paddle.log(d) - paddle.log(t) + 1) - - paddle.log(d) * paddle.log(t) - + paddle.log(d) - + paddle.log(t) ** 2 - - 2 * paddle.log(t) - + 2 - ) - denominator = (paddle.log(b) - paddle.log(c)) * (paddle.log(b) - paddle.log(d)) - return numerator / denominator - - coef1 = ind_fn(rho_t, rho_s0, rho_s1, rho_s2) - ind_fn(rho_s0, rho_s0, rho_s1, rho_s2) - coef2 = ind_fn(rho_t, rho_s1, rho_s2, rho_s0) - ind_fn(rho_s0, rho_s1, rho_s2, rho_s0) - coef3 = ind_fn(rho_t, rho_s2, rho_s0, rho_s1) - ind_fn(rho_s0, rho_s2, rho_s0, rho_s1) - - x_t = alpha_t * (sample / alpha_s0 + coef1 * m0 + coef2 * m1 + coef3 * m2) - - return x_t - else: - raise NotImplementedError("only support log-rho multistep deis now") - - def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool = True, - ) -> Union[SchedulerOutput, Tuple]: - """ - Step function propagating the sample with the multistep DEIS. - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - return_dict (`bool`): option for returning tuple rather than SchedulerOutput class - - Returns: - [`~scheduling_utils.SchedulerOutput`] or `tuple`: [`~scheduling_utils.SchedulerOutput`] if `return_dict` is - True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. 
- - """ - if self.num_inference_steps is None: - raise ValueError( - "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" - ) - - step_index = (self.timesteps == timestep).nonzero() - if len(step_index) == 0: - step_index = len(self.timesteps) - 1 - else: - step_index = step_index.item() - prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1] - lower_order_final = ( - (step_index == len(self.timesteps) - 1) and self.config.lower_order_final and len(self.timesteps) < 15 - ) - lower_order_second = ( - (step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15 - ) - - model_output = self.convert_model_output(model_output, timestep, sample) - for i in range(self.config.solver_order - 1): - self.model_outputs[i] = self.model_outputs[i + 1] - self.model_outputs[-1] = model_output - - if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final: - prev_sample = self.deis_first_order_update(model_output, timestep, prev_timestep, sample) - elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second: - timestep_list = [self.timesteps[step_index - 1], timestep] - prev_sample = self.multistep_deis_second_order_update( - self.model_outputs, timestep_list, prev_timestep, sample - ) - else: - timestep_list = [self.timesteps[step_index - 2], self.timesteps[step_index - 1], timestep] - prev_sample = self.multistep_deis_third_order_update( - self.model_outputs, timestep_list, prev_timestep, sample - ) - - if self.lower_order_nums < self.config.solver_order: - self.lower_order_nums += 1 - - if not return_dict: - return (prev_sample,) - - return SchedulerOutput(prev_sample=prev_sample) - - def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. - - Args: - sample (`paddle.Tensor`): input sample - - Returns: - `paddle.Tensor`: scaled input sample - """ - return sample - - def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, - ) -> paddle.Tensor: - # Make sure alphas_cumprod and timestep have same dtype as original_samples - alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype) - - sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 - sqrt_alpha_prod = sqrt_alpha_prod.flatten() - while len(sqrt_alpha_prod.shape) < len(original_samples.shape): - sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - - noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise - return noisy_samples - - def __len__(self): - return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py deleted file mode 100644 index 679c51738521..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py +++ /dev/null @@ -1,606 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 TSAIL Team and The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver - -import math -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle - -from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput - - -# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. - - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. - - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ - - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 - - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return paddle.to_tensor(betas, dtype=paddle.float32) - - -class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): - """ - DPM-Solver (and the improved version DPM-Solver++) is a fast dedicated high-order solver for diffusion ODEs with - the convergence order guarantee. Empirically, sampling by DPM-Solver with only 20 steps can generate high-quality - samples, and it can generate quite good samples even in only 10 steps. - - For more details, see the original paper: https://arxiv.org/abs/2206.00927 and https://arxiv.org/abs/2211.01095 - - Currently, we support the multistep DPM-Solver for both noise prediction models and data prediction models. We - recommend to use `solver_order=2` for guided sampling, and `solver_order=3` for unconditional sampling. - - We also support the "dynamic thresholding" method in Imagen (https://arxiv.org/abs/2205.11487). For pixel-space - diffusion models, you can set both `algorithm_type="dpmsolver++"` and `thresholding=True` to use the dynamic - thresholding. Note that the thresholding method is unsuitable for latent-space diffusion models (such as - stable-diffusion). - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. 
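As a reference point for `betas_for_alpha_bar` above: the betas are defined so that the cumulative product of `(1 - beta_i)` telescopes back to the cosine `alpha_bar` function. A hedged NumPy sketch (standalone, not scheduler code):

```python
# Illustrative check: with betas built from the squaredcos_cap_v2 / Glide cosine
# alpha_bar, cumprod(1 - beta) tracks alpha_bar((i+1)/N) up to the constant
# 1/alpha_bar(0) ~ 1.0002, wherever the max_beta clipping is not active.
import math
import numpy as np

def alpha_bar(t):
    return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

N, max_beta = 1000, 0.999
betas = np.array([
    min(1 - alpha_bar((i + 1) / N) / alpha_bar(i / N), max_beta) for i in range(N)
])
alphas_cumprod = np.cumprod(1.0 - betas)

i = 499  # an arbitrary interior step
print(alphas_cumprod[i], alpha_bar((i + 1) / N))  # agree to within ~0.02%
```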
- beta_start (`float`): the starting `beta` value of inference. - beta_end (`float`): the final `beta` value. - beta_schedule (`str`): - the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear`, `scaled_linear`, or `squaredcos_cap_v2`. - trained_betas (`np.ndarray`, optional): - option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. - solver_order (`int`, default `2`): - the order of DPM-Solver; can be `1` or `2` or `3`. We recommend to use `solver_order=2` for guided - sampling, and `solver_order=3` for unconditional sampling. - prediction_type (`str`, default `epsilon`, optional): - prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion - process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 - https://imagen.research.google/video/paper.pdf) - thresholding (`bool`, default `False`): - whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487). - For pixel-space diffusion models, you can set both `algorithm_type=dpmsolver++` and `thresholding=True` to - use the dynamic thresholding. Note that the thresholding method is unsuitable for latent-space diffusion - models (such as stable-diffusion). - dynamic_thresholding_ratio (`float`, default `0.995`): - the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen - (https://arxiv.org/abs/2205.11487). - sample_max_value (`float`, default `1.0`): - the threshold value for dynamic thresholding. Valid only when `thresholding=True` and - `algorithm_type="dpmsolver++`. - algorithm_type (`str`, default `dpmsolver++`): - the algorithm type for the solver. Either `dpmsolver` or `dpmsolver++`. The `dpmsolver` type implements the - algorithms in https://arxiv.org/abs/2206.00927, and the `dpmsolver++` type implements the algorithms in - https://arxiv.org/abs/2211.01095. We recommend to use `dpmsolver++` with `solver_order=2` for guided - sampling (e.g. stable-diffusion). - solver_type (`str`, default `midpoint`): - the solver type for the second-order solver. Either `midpoint` or `heun`. The solver type slightly affects - the sample quality, especially for small number of steps. We empirically find that `midpoint` solvers are - slightly better, so we recommend to use the `midpoint` type. - lower_order_final (`bool`, default `True`): - whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. We empirically - find this trick can stabilize the sampling of DPM-Solver for steps < 15, especially for steps <= 10. - use_karras_sigmas (`bool`, *optional*, defaults to `False`): - This parameter controls whether to use Karras sigmas (Karras et al. (2022) scheme) for step sizes in the - noise schedule during the sampling process. If True, the sigmas will be determined according to a sequence - of noise levels {σi} as defined in Equation (5) of the paper https://arxiv.org/pdf/2206.00364.pdf. 
- """ - - _compatibles = [e.name for e in KarrasDiffusionSchedulers] - order = 1 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.0001, - beta_end: float = 0.02, - beta_schedule: str = "linear", - trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - solver_order: int = 2, - prediction_type: str = "epsilon", - thresholding: bool = False, - dynamic_thresholding_ratio: float = 0.995, - sample_max_value: float = 1.0, - algorithm_type: str = "dpmsolver++", - solver_type: str = "midpoint", - lower_order_final: bool = True, - use_karras_sigmas: Optional[bool] = False, - ): - if trained_betas is not None: - self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) - elif beta_schedule == "linear": - self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) - elif beta_schedule == "scaled_linear": - # this schedule is very specific to the latent diffusion model. - self.betas = ( - paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2 - ) - elif beta_schedule == "squaredcos_cap_v2": - # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) - else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - # Currently we only support VP-type noise schedule - self.alpha_t = paddle.sqrt(self.alphas_cumprod) - self.sigma_t = paddle.sqrt(1 - self.alphas_cumprod) - self.lambda_t = paddle.log(self.alpha_t) - paddle.log(self.sigma_t) - - # standard deviation of the initial noise distribution - self.init_noise_sigma = 1.0 - - # settings for DPM-Solver - if algorithm_type not in ["dpmsolver", "dpmsolver++"]: - if algorithm_type == "deis": - self.register_to_config(algorithm_type="dpmsolver++") - else: - raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") - - if solver_type not in ["midpoint", "heun"]: - if solver_type in ["logrho", "bh1", "bh2"]: - self.register_to_config(solver_type="midpoint") - else: - raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") - - # setable values - self.num_inference_steps = None - timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy() - self.timesteps = paddle.to_tensor(timesteps) - self.model_outputs = [None] * solver_order - self.lower_order_nums = 0 - self.use_karras_sigmas = use_karras_sigmas - - def set_timesteps(self, num_inference_steps: int): - """ - Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. - """ - timesteps = ( - np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1) - .round()[::-1][:-1] - .copy() - .astype(np.int64) - ) - if self.use_karras_sigmas: - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) - log_sigmas = np.log(sigmas) - sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) - timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() - timesteps = np.flip(timesteps).copy().astype(np.int64) - - # when num_inference_steps == num_train_timesteps, we can end up with - # duplicates in timesteps. 
- _, unique_indices = np.unique(timesteps, return_index=True) - timesteps = timesteps[np.sort(unique_indices)] - - self.timesteps = paddle.to_tensor(timesteps) - - self.num_inference_steps = len(timesteps) - - self.model_outputs = [ - None, - ] * self.config.solver_order - self.lower_order_nums = 0 - - def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: - """ - "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the - prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by - s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing - pixels from saturation at each step. We find that dynamic thresholding results in significantly better - photorealism as well as better image-text alignment, especially when using very large guidance weights." - https://arxiv.org/abs/2205.11487 - """ - dtype = sample.dtype - batch_size, channels, height, width = sample.shape - - if dtype not in (paddle.float32, paddle.float64): - sample = paddle.cast( - sample, "float32" - ) # upcast for quantile calculation, and clamp not implemented for cpu half - - # Flatten sample for doing quantile calculation along each image - sample = paddle.reshape(sample, [batch_size, channels * height * width]) - - abs_sample = sample.abs() # "a certain percentile absolute pixel value" - - s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1) - # paddle.clip donot support min > max - if self.config.sample_max_value < 1: - s = paddle.ones_like(s) * self.config.sample_max_value - else: - s = paddle.clip( - s, min=1, max=self.config.sample_max_value - ) # When clip to min=1, equivalent to standard clipping to [-1, 1] - s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0 - sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" - - sample = paddle.reshape(sample, [batch_size, channels, height, width]) - sample = paddle.cast(sample, dtype) - - return sample - - def _sigma_to_t(self, sigma, log_sigmas): - # get log sigma - log_sigma = np.log(sigma) - - # get distribution - dists = log_sigma - log_sigmas[:, np.newaxis] - - # get sigmas range - low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) - high_idx = low_idx + 1 - - low = log_sigmas[low_idx] - high = log_sigmas[high_idx] - - # interpolate sigmas - w = (low - log_sigma) / (low - high) - w = np.clip(w, 0, 1) - - # transform interpolation to time range - t = (1 - w) * low_idx + w * high_idx - t = t.reshape(sigma.shape) - return t - - def _convert_to_karras(self, in_sigmas: paddle.Tensor, num_inference_steps) -> paddle.Tensor: - """Constructs the noise schedule of Karras et al. (2022).""" - - sigma_min = in_sigmas[-1].item() - sigma_max = in_sigmas[0].item() - - rho = 7.0 # 7.0 is the value used in the paper - ramp = np.linspace(0, 1, num_inference_steps) - min_inv_rho = sigma_min ** (1 / rho) - max_inv_rho = sigma_max ** (1 / rho) - sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho - return sigmas - - def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor: - """ - Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs. 
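For the Karras-sigma option referenced above, `_convert_to_karras` builds the noise levels from Equation (5) of https://arxiv.org/pdf/2206.00364.pdf. A short NumPy sketch with assumed `sigma_min`/`sigma_max` (rather than values derived from the betas):

```python
# Illustrative sketch of the Karras et al. (2022) sigma ramp:
# interpolate linearly in sigma**(1/rho), then raise back to the rho power.
import numpy as np

sigma_min, sigma_max, rho = 0.03, 15.0, 7.0   # assumed values, rho=7 as in the paper
num_inference_steps = 10

ramp = np.linspace(0, 1, num_inference_steps)
min_inv_rho = sigma_min ** (1 / rho)
max_inv_rho = sigma_max ** (1 / rho)
sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho

print(sigmas[0], sigmas[-1])  # starts at sigma_max, ends at sigma_min
print(np.round(sigmas, 3))    # steps concentrate near the low-sigma (fine-detail) end
```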
- - DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to - discretize an integral of the data prediction model. So we need to first convert the model output to the - corresponding type to match the algorithm. - - Note that the algorithm type and the model type is decoupled. That is to say, we can use either DPM-Solver or - DPM-Solver++ for both noise prediction model and data prediction model. - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - - Returns: - `paddle.Tensor`: the converted model output. - """ - # DPM-Solver++ needs to solve an integral of the data prediction model. - if self.config.algorithm_type == "dpmsolver++": - if self.config.prediction_type == "epsilon": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] - x0_pred = (sample - sigma_t * model_output) / alpha_t - elif self.config.prediction_type == "sample": - x0_pred = model_output - elif self.config.prediction_type == "v_prediction": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] - x0_pred = alpha_t * sample - sigma_t * model_output - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the DPMSolverMultistepScheduler." - ) - - if self.config.thresholding: - x0_pred = self._threshold_sample(x0_pred) - - return x0_pred - # DPM-Solver needs to solve an integral of the noise prediction model. - elif self.config.algorithm_type == "dpmsolver": - if self.config.prediction_type == "epsilon": - return model_output - elif self.config.prediction_type == "sample": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] - epsilon = (sample - alpha_t * model_output) / sigma_t - return epsilon - elif self.config.prediction_type == "v_prediction": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] - epsilon = alpha_t * model_output + sigma_t * sample - return epsilon - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the DPMSolverMultistepScheduler." - ) - - def dpm_solver_first_order_update( - self, - model_output: paddle.Tensor, - timestep: int, - prev_timestep: int, - sample: paddle.Tensor, - ) -> paddle.Tensor: - """ - One step for the first-order DPM-Solver (equivalent to DDIM). - - See https://arxiv.org/abs/2206.00927 for the detailed derivation. - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - prev_timestep (`int`): previous discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - - Returns: - `paddle.Tensor`: the sample tensor at the previous timestep. 
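The conversions performed by `convert_model_output` above are consistent re-parametrizations of the same prediction under the VP constraint `alpha_t**2 + sigma_t**2 == 1`. A scalar NumPy sanity sketch with made-up values:

```python
# Sanity sketch: epsilon-, sample- and v-prediction are interchangeable:
#   x_t = alpha_t * x0 + sigma_t * eps
#   v   = alpha_t * eps - sigma_t * x0
import numpy as np

alpha_t = 0.8
sigma_t = np.sqrt(1.0 - alpha_t**2)
x0, eps = 1.3, -0.4                      # made-up clean sample and noise

x_t = alpha_t * x0 + sigma_t * eps
v = alpha_t * eps - sigma_t * x0

x0_from_eps = (x_t - sigma_t * eps) / alpha_t   # "epsilon"      -> data prediction
x0_from_v = alpha_t * x_t - sigma_t * v         # "v_prediction" -> data prediction
eps_from_v = alpha_t * v + sigma_t * x_t        # "v_prediction" -> noise prediction

print(np.allclose([x0_from_eps, x0_from_v], x0))  # True
print(np.isclose(eps_from_v, eps))                # True
```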
- """ - lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep] - alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep] - sigma_t, sigma_s = self.sigma_t[prev_timestep], self.sigma_t[timestep] - h = lambda_t - lambda_s - if self.config.algorithm_type == "dpmsolver++": - x_t = (sigma_t / sigma_s) * sample - (alpha_t * (paddle.exp(-h) - 1.0)) * model_output - elif self.config.algorithm_type == "dpmsolver": - x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0)) * model_output - return x_t - - def multistep_dpm_solver_second_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, - ) -> paddle.Tensor: - """ - One step for the second-order multistep DPM-Solver. - - Args: - model_output_list (`List[paddle.Tensor]`): - direct outputs from learned diffusion model at current and latter timesteps. - timestep (`int`): current and latter discrete timestep in the diffusion chain. - prev_timestep (`int`): previous discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - - Returns: - `paddle.Tensor`: the sample tensor at the previous timestep. - """ - t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2] - m0, m1 = model_output_list[-1], model_output_list[-2] - lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1] - alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0] - sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0] - h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1 - r0 = h_0 / h - D0, D1 = m0, (1.0 / r0) * (m0 - m1) - if self.config.algorithm_type == "dpmsolver++": - # See https://arxiv.org/abs/2211.01095 for detailed derivations - if self.config.solver_type == "midpoint": - x_t = ( - (sigma_t / sigma_s0) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 - - 0.5 * (alpha_t * (paddle.exp(-h) - 1.0)) * D1 - ) - elif self.config.solver_type == "heun": - x_t = ( - (sigma_t / sigma_s0) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 - + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1 - ) - elif self.config.algorithm_type == "dpmsolver": - # See https://arxiv.org/abs/2206.00927 for detailed derivations - if self.config.solver_type == "midpoint": - x_t = ( - (alpha_t / alpha_s0) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - - 0.5 * (sigma_t * (paddle.exp(h) - 1.0)) * D1 - ) - elif self.config.solver_type == "heun": - x_t = ( - (alpha_t / alpha_s0) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1 - ) - return x_t - - def multistep_dpm_solver_third_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, - ) -> paddle.Tensor: - """ - One step for the third-order multistep DPM-Solver. - - Args: - model_output_list (`List[paddle.Tensor]`): - direct outputs from learned diffusion model at current and latter timesteps. - timestep (`int`): current and latter discrete timestep in the diffusion chain. - prev_timestep (`int`): previous discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - - Returns: - `paddle.Tensor`: the sample tensor at the previous timestep. 
- """ - t, s0, s1, s2 = prev_timestep, timestep_list[-1], timestep_list[-2], timestep_list[-3] - m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3] - lambda_t, lambda_s0, lambda_s1, lambda_s2 = ( - self.lambda_t[t], - self.lambda_t[s0], - self.lambda_t[s1], - self.lambda_t[s2], - ) - alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0] - sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0] - h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2 - r0, r1 = h_0 / h, h_1 / h - D0 = m0 - D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2) - D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1) - D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1) - if self.config.algorithm_type == "dpmsolver++": - # See https://arxiv.org/abs/2206.00927 for detailed derivations - x_t = ( - (sigma_t / sigma_s0) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 - + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1 - - (alpha_t * ((paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2 - ) - elif self.config.algorithm_type == "dpmsolver": - # See https://arxiv.org/abs/2206.00927 for detailed derivations - x_t = ( - (alpha_t / alpha_s0) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1 - - (sigma_t * ((paddle.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2 - ) - return x_t - - def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool = True, - ) -> Union[SchedulerOutput, Tuple]: - """ - Step function propagating the sample with the multistep DPM-Solver. - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - return_dict (`bool`): option for returning tuple rather than SchedulerOutput class - - Returns: - [`~scheduling_utils.SchedulerOutput`] or `tuple`: [`~scheduling_utils.SchedulerOutput`] if `return_dict` is - True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. 
- - """ - if self.num_inference_steps is None: - raise ValueError( - "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" - ) - - step_index = (self.timesteps == timestep).nonzero() - if len(step_index) == 0: - step_index = len(self.timesteps) - 1 - else: - step_index = step_index.item() - prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1] - lower_order_final = ( - (step_index == len(self.timesteps) - 1) and self.config.lower_order_final and len(self.timesteps) < 15 - ) - lower_order_second = ( - (step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15 - ) - - model_output = self.convert_model_output(model_output, timestep, sample) - for i in range(self.config.solver_order - 1): - self.model_outputs[i] = self.model_outputs[i + 1] - self.model_outputs[-1] = model_output - - if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final: - prev_sample = self.dpm_solver_first_order_update(model_output, timestep, prev_timestep, sample) - elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second: - timestep_list = [self.timesteps[step_index - 1], timestep] - prev_sample = self.multistep_dpm_solver_second_order_update( - self.model_outputs, timestep_list, prev_timestep, sample - ) - else: - timestep_list = [self.timesteps[step_index - 2], self.timesteps[step_index - 1], timestep] - prev_sample = self.multistep_dpm_solver_third_order_update( - self.model_outputs, timestep_list, prev_timestep, sample - ) - - if self.lower_order_nums < self.config.solver_order: - self.lower_order_nums += 1 - - if not return_dict: - return (prev_sample,) - - return SchedulerOutput(prev_sample=prev_sample) - - def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. - - Args: - sample (`paddle.Tensor`): input sample - - Returns: - `paddle.Tensor`: scaled input sample - """ - return sample - - def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, - ) -> paddle.Tensor: - # Make sure alphas_cumprod and timestep have same dtype as original_samples - alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype) - - sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 - sqrt_alpha_prod = sqrt_alpha_prod.flatten() - while len(sqrt_alpha_prod.shape) < len(original_samples.shape): - sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - - noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise - return noisy_samples - - def __len__(self): - return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_singlestep.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_singlestep.py deleted file mode 100644 index 4269e763b55e..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ /dev/null @@ -1,626 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 TSAIL Team and The HuggingFace Team. 
All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver - -import math -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle - -from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput - - -# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. - - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. - - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ - - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 - - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return paddle.to_tensor(betas, dtype=paddle.float32) - - -class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): - """ - DPM-Solver (and the improved version DPM-Solver++) is a fast dedicated high-order solver for diffusion ODEs with - the convergence order guarantee. Empirically, sampling by DPM-Solver with only 20 steps can generate high-quality - samples, and it can generate quite good samples even in only 10 steps. - - For more details, see the original paper: https://arxiv.org/abs/2206.00927 and https://arxiv.org/abs/2211.01095 - - Currently, we support the singlestep DPM-Solver for both noise prediction models and data prediction models. We - recommend to use `solver_order=2` for guided sampling, and `solver_order=3` for unconditional sampling. - - We also support the "dynamic thresholding" method in Imagen (https://arxiv.org/abs/2205.11487). For pixel-space - diffusion models, you can set both `algorithm_type="dpmsolver++"` and `thresholding=True` to use the dynamic - thresholding. Note that the thresholding method is unsuitable for latent-space diffusion models (such as - stable-diffusion). - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. 
- - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. - beta_start (`float`): the starting `beta` value of inference. - beta_end (`float`): the final `beta` value. - beta_schedule (`str`): - the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear`, `scaled_linear`, or `squaredcos_cap_v2`. - trained_betas (`np.ndarray`, optional): - option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. - solver_order (`int`, default `2`): - the order of DPM-Solver; can be `1` or `2` or `3`. We recommend to use `solver_order=2` for guided - sampling, and `solver_order=3` for unconditional sampling. - prediction_type (`str`, default `epsilon`): - indicates whether the model predicts the noise (epsilon), or the data / `x0`. One of `epsilon`, `sample`, - or `v-prediction`. - thresholding (`bool`, default `False`): - whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487). - For pixel-space diffusion models, you can set both `algorithm_type=dpmsolver++` and `thresholding=True` to - use the dynamic thresholding. Note that the thresholding method is unsuitable for latent-space diffusion - models (such as stable-diffusion). - dynamic_thresholding_ratio (`float`, default `0.995`): - the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen - (https://arxiv.org/abs/2205.11487). - sample_max_value (`float`, default `1.0`): - the threshold value for dynamic thresholding. Valid only when `thresholding=True` and - `algorithm_type="dpmsolver++`. - algorithm_type (`str`, default `dpmsolver++`): - the algorithm type for the solver. Either `dpmsolver` or `dpmsolver++`. The `dpmsolver` type implements the - algorithms in https://arxiv.org/abs/2206.00927, and the `dpmsolver++` type implements the algorithms in - https://arxiv.org/abs/2211.01095. We recommend to use `dpmsolver++` with `solver_order=2` for guided - sampling (e.g. stable-diffusion). - solver_type (`str`, default `midpoint`): - the solver type for the second-order solver. Either `midpoint` or `heun`. The solver type slightly affects - the sample quality, especially for small number of steps. We empirically find that `midpoint` solvers are - slightly better, so we recommend to use the `midpoint` type. - lower_order_final (`bool`, default `True`): - whether to use lower-order solvers in the final steps. For singlestep schedulers, we recommend to enable - this to use up all the function evaluations. - - """ - - _compatibles = [e.name for e in KarrasDiffusionSchedulers] - order = 1 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.0001, - beta_end: float = 0.02, - beta_schedule: str = "linear", - trained_betas: Optional[np.ndarray] = None, - solver_order: int = 2, - prediction_type: str = "epsilon", - thresholding: bool = False, - dynamic_thresholding_ratio: float = 0.995, - sample_max_value: float = 1.0, - algorithm_type: str = "dpmsolver++", - solver_type: str = "midpoint", - lower_order_final: bool = True, - ): - if trained_betas is not None: - self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) - elif beta_schedule == "linear": - self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) - elif beta_schedule == "scaled_linear": - # this schedule is very specific to the latent diffusion model. 
- self.betas = ( - paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2 - ) - elif beta_schedule == "squaredcos_cap_v2": - # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) - else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - # Currently we only support VP-type noise schedule - self.alpha_t = paddle.sqrt(self.alphas_cumprod) - self.sigma_t = paddle.sqrt(1 - self.alphas_cumprod) - self.lambda_t = paddle.log(self.alpha_t) - paddle.log(self.sigma_t) - - # standard deviation of the initial noise distribution - self.init_noise_sigma = 1.0 - - # settings for DPM-Solver - if algorithm_type not in ["dpmsolver", "dpmsolver++"]: - if algorithm_type == "deis": - self.register_to_config(algorithm_type="dpmsolver++") - else: - raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") - if solver_type not in ["midpoint", "heun"]: - if solver_type in ["logrho", "bh1", "bh2"]: - self.register_to_config(solver_type="midpoint") - else: - raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") - - # setable values - self.num_inference_steps = None - timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy() - self.timesteps = paddle.to_tensor(timesteps) - self.model_outputs = [None] * solver_order - self.sample = None - self.order_list = self.get_order_list(num_train_timesteps) - - def get_order_list(self, num_inference_steps: int) -> List[int]: - """ - Computes the solver order at each time step. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. - """ - steps = num_inference_steps - order = self.config.solver_order - if self.config.lower_order_final: - if order == 3: - if steps % 3 == 0: - orders = [1, 2, 3] * (steps // 3 - 1) + [1, 2] + [1] - elif steps % 3 == 1: - orders = [1, 2, 3] * (steps // 3) + [1] - else: - orders = [1, 2, 3] * (steps // 3) + [1, 2] - elif order == 2: - if steps % 2 == 0: - orders = [1, 2] * (steps // 2) - else: - orders = [1, 2] * (steps // 2) + [1] - elif order == 1: - orders = [1] * steps - else: - if order == 3: - orders = [1, 2, 3] * (steps // 3) - elif order == 2: - orders = [1, 2] * (steps // 2) - elif order == 1: - orders = [1] * steps - return orders - - def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: - """ - "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the - prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by - s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing - pixels from saturation at each step. We find that dynamic thresholding results in significantly better - photorealism as well as better image-text alignment, especially when using very large guidance weights." 
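To make the order pattern produced by `get_order_list` above concrete, here is a standalone re-statement of the `solver_order=3`, `lower_order_final=True` branch (not the scheduler itself): the orders cycle 1, 2, 3 and the tail is padded with lower orders so every model evaluation is used.

```python
# Illustrative sketch of the singlestep order pattern for solver_order=3
# with lower_order_final=True.
def order_pattern(steps):
    if steps % 3 == 0:
        return [1, 2, 3] * (steps // 3 - 1) + [1, 2] + [1]
    if steps % 3 == 1:
        return [1, 2, 3] * (steps // 3) + [1]
    return [1, 2, 3] * (steps // 3) + [1, 2]

for steps in (9, 10, 11):
    print(steps, order_pattern(steps))
# 9  [1, 2, 3, 1, 2, 3, 1, 2, 1]
# 10 [1, 2, 3, 1, 2, 3, 1, 2, 3, 1]
# 11 [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2]
```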
- https://arxiv.org/abs/2205.11487 - """ - dtype = sample.dtype - batch_size, channels, height, width = sample.shape - - if dtype not in (paddle.float32, paddle.float64): - sample = paddle.cast( - sample, "float32" - ) # upcast for quantile calculation, and clamp not implemented for cpu half - - # Flatten sample for doing quantile calculation along each image - sample = paddle.reshape(sample, [batch_size, channels * height * width]) - - abs_sample = sample.abs() # "a certain percentile absolute pixel value" - - s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1) - # paddle.clip donot support min > max - if self.config.sample_max_value < 1: - s = paddle.ones_like(s) * self.config.sample_max_value - else: - s = paddle.clip( - s, min=1, max=self.config.sample_max_value - ) # When clip to min=1, equivalent to standard clipping to [-1, 1] - s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0 - sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" - - sample = paddle.reshape(sample, [batch_size, channels, height, width]) - sample = paddle.cast(sample, dtype) - - return sample - - def set_timesteps(self, num_inference_steps: int): - """ - Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. - """ - self.num_inference_steps = num_inference_steps - timesteps = ( - np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1) - .round()[::-1][:-1] - .copy() - .astype(np.int64) - ) - self.timesteps = paddle.to_tensor(timesteps) - self.model_outputs = [None] * self.config.solver_order - self.sample = None - self.orders = self.get_order_list(num_inference_steps) - - def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor: - """ - Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs. - - DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to - discretize an integral of the data prediction model. So we need to first convert the model output to the - corresponding type to match the algorithm. - - Note that the algorithm type and the model type is decoupled. That is to say, we can use either DPM-Solver or - DPM-Solver++ for both noise prediction model and data prediction model. - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - - Returns: - `paddle.Tensor`: the converted model output. - """ - # DPM-Solver++ needs to solve an integral of the data prediction model. 
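A compact NumPy illustration of the dynamic thresholding performed by `_threshold_sample` above, on a made-up batch and with an assumed non-default `sample_max_value` so the dynamic part is visible (the default of 1.0 reduces to plain clipping to [-1, 1]):

```python
# Illustrative dynamic thresholding: per-sample percentile s of |x0|,
# clip to [-s, s], then divide by s (Imagen, https://arxiv.org/abs/2205.11487).
import numpy as np

rng = np.random.default_rng(0)
x0 = 3.0 * rng.standard_normal((2, 3 * 8 * 8))   # flattened (batch, C*H*W) x0 predictions
dynamic_thresholding_ratio = 0.995
sample_max_value = 2.5                            # assumed non-default value, for illustration

s = np.quantile(np.abs(x0), dynamic_thresholding_ratio, axis=1)
s = np.clip(s, 1.0, sample_max_value)[:, None]    # per-sample threshold, broadcast over pixels

x0_thresholded = np.clip(x0, -s, s) / s
print(np.abs(x0).max(), np.abs(x0_thresholded).max())  # large before, exactly 1.0 after
```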
- if self.config.algorithm_type == "dpmsolver++": - if self.config.prediction_type == "epsilon": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] - x0_pred = (sample - sigma_t * model_output) / alpha_t - elif self.config.prediction_type == "sample": - x0_pred = model_output - elif self.config.prediction_type == "v_prediction": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] - x0_pred = alpha_t * sample - sigma_t * model_output - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the DPMSolverSinglestepScheduler." - ) - - if self.config.thresholding: - x0_pred = self._threshold_sample(x0_pred) - - return x0_pred - # DPM-Solver needs to solve an integral of the noise prediction model. - elif self.config.algorithm_type == "dpmsolver": - if self.config.prediction_type == "epsilon": - return model_output - elif self.config.prediction_type == "sample": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] - epsilon = (sample - alpha_t * model_output) / sigma_t - return epsilon - elif self.config.prediction_type == "v_prediction": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] - epsilon = alpha_t * model_output + sigma_t * sample - return epsilon - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the DPMSolverSinglestepScheduler." - ) - - def dpm_solver_first_order_update( - self, - model_output: paddle.Tensor, - timestep: int, - prev_timestep: int, - sample: paddle.Tensor, - ) -> paddle.Tensor: - """ - One step for the first-order DPM-Solver (equivalent to DDIM). - - See https://arxiv.org/abs/2206.00927 for the detailed derivation. - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - prev_timestep (`int`): previous discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - - Returns: - `paddle.Tensor`: the sample tensor at the previous timestep. - """ - lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep] - alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep] - sigma_t, sigma_s = self.sigma_t[prev_timestep], self.sigma_t[timestep] - h = lambda_t - lambda_s - if self.config.algorithm_type == "dpmsolver++": - x_t = (sigma_t / sigma_s) * sample - (alpha_t * (paddle.exp(-h) - 1.0)) * model_output - elif self.config.algorithm_type == "dpmsolver": - x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0)) * model_output - return x_t - - def singlestep_dpm_solver_second_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, - ) -> paddle.Tensor: - """ - One step for the second-order singlestep DPM-Solver. - - It computes the solution at time `prev_timestep` from the time `timestep_list[-2]`. - - Args: - model_output_list (`List[paddle.Tensor]`): - direct outputs from learned diffusion model at current and latter timesteps. - timestep (`int`): current and latter discrete timestep in the diffusion chain. - prev_timestep (`int`): previous discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. 
- - Returns: - `paddle.Tensor`: the sample tensor at the previous timestep. - """ - t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2] - m0, m1 = model_output_list[-1], model_output_list[-2] - lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1] - alpha_t, alpha_s1 = self.alpha_t[t], self.alpha_t[s1] - sigma_t, sigma_s1 = self.sigma_t[t], self.sigma_t[s1] - h, h_0 = lambda_t - lambda_s1, lambda_s0 - lambda_s1 - r0 = h_0 / h - D0, D1 = m1, (1.0 / r0) * (m0 - m1) - if self.config.algorithm_type == "dpmsolver++": - # See https://arxiv.org/abs/2211.01095 for detailed derivations - if self.config.solver_type == "midpoint": - x_t = ( - (sigma_t / sigma_s1) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 - - 0.5 * (alpha_t * (paddle.exp(-h) - 1.0)) * D1 - ) - elif self.config.solver_type == "heun": - x_t = ( - (sigma_t / sigma_s1) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 - + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1 - ) - elif self.config.algorithm_type == "dpmsolver": - # See https://arxiv.org/abs/2206.00927 for detailed derivations - if self.config.solver_type == "midpoint": - x_t = ( - (alpha_t / alpha_s1) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - - 0.5 * (sigma_t * (paddle.exp(h) - 1.0)) * D1 - ) - elif self.config.solver_type == "heun": - x_t = ( - (alpha_t / alpha_s1) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1 - ) - return x_t - - def singlestep_dpm_solver_third_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, - ) -> paddle.Tensor: - """ - One step for the third-order singlestep DPM-Solver. - - It computes the solution at time `prev_timestep` from the time `timestep_list[-3]`. - - Args: - model_output_list (`List[paddle.Tensor]`): - direct outputs from learned diffusion model at current and latter timesteps. - timestep (`int`): current and latter discrete timestep in the diffusion chain. - prev_timestep (`int`): previous discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - - Returns: - `paddle.Tensor`: the sample tensor at the previous timestep. 
- """ - t, s0, s1, s2 = prev_timestep, timestep_list[-1], timestep_list[-2], timestep_list[-3] - m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3] - lambda_t, lambda_s0, lambda_s1, lambda_s2 = ( - self.lambda_t[t], - self.lambda_t[s0], - self.lambda_t[s1], - self.lambda_t[s2], - ) - alpha_t, alpha_s2 = self.alpha_t[t], self.alpha_t[s2] - sigma_t, sigma_s2 = self.sigma_t[t], self.sigma_t[s2] - h, h_0, h_1 = lambda_t - lambda_s2, lambda_s0 - lambda_s2, lambda_s1 - lambda_s2 - r0, r1 = h_0 / h, h_1 / h - D0 = m2 - D1_0, D1_1 = (1.0 / r1) * (m1 - m2), (1.0 / r0) * (m0 - m2) - D1 = (r0 * D1_0 - r1 * D1_1) / (r0 - r1) - D2 = 2.0 * (D1_1 - D1_0) / (r0 - r1) - if self.config.algorithm_type == "dpmsolver++": - # See https://arxiv.org/abs/2206.00927 for detailed derivations - if self.config.solver_type == "midpoint": - x_t = ( - (sigma_t / sigma_s2) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 - + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1_1 - ) - elif self.config.solver_type == "heun": - x_t = ( - (sigma_t / sigma_s2) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 - + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1 - - (alpha_t * ((paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2 - ) - elif self.config.algorithm_type == "dpmsolver": - # See https://arxiv.org/abs/2206.00927 for detailed derivations - if self.config.solver_type == "midpoint": - x_t = ( - (alpha_t / alpha_s2) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1_1 - ) - elif self.config.solver_type == "heun": - x_t = ( - (alpha_t / alpha_s2) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1 - - (sigma_t * ((paddle.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2 - ) - return x_t - - def singlestep_dpm_solver_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, - order: int, - ) -> paddle.Tensor: - """ - One step for the singlestep DPM-Solver. - - Args: - model_output_list (`List[paddle.Tensor]`): - direct outputs from learned diffusion model at current and latter timesteps. - timestep (`int`): current and latter discrete timestep in the diffusion chain. - prev_timestep (`int`): previous discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - order (`int`): - the solver order at this step. - - Returns: - `paddle.Tensor`: the sample tensor at the previous timestep. - """ - if order == 1: - return self.dpm_solver_first_order_update(model_output_list[-1], timestep_list[-1], prev_timestep, sample) - elif order == 2: - return self.singlestep_dpm_solver_second_order_update( - model_output_list, timestep_list, prev_timestep, sample - ) - elif order == 3: - return self.singlestep_dpm_solver_third_order_update( - model_output_list, timestep_list, prev_timestep, sample - ) - else: - raise ValueError(f"Order must be 1, 2, 3, got {order}") - - def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool = True, - ) -> Union[SchedulerOutput, Tuple]: - """ - Step function propagating the sample with the singlestep DPM-Solver. - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. 
- return_dict (`bool`): option for returning tuple rather than SchedulerOutput class - - Returns: - [`~scheduling_utils.SchedulerOutput`] or `tuple`: [`~scheduling_utils.SchedulerOutput`] if `return_dict` is - True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. - - """ - if self.num_inference_steps is None: - raise ValueError( - "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" - ) - - step_index = (self.timesteps == timestep).nonzero() - if len(step_index) == 0: - step_index = len(self.timesteps) - 1 - else: - step_index = step_index.item() - prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1] - - model_output = self.convert_model_output(model_output, timestep, sample) - for i in range(self.config.solver_order - 1): - self.model_outputs[i] = self.model_outputs[i + 1] - self.model_outputs[-1] = model_output - - order = self.order_list[step_index] - # For single-step solvers, we use the initial value at each time with order = 1. - if order == 1: - self.sample = sample - - timestep_list = [self.timesteps[step_index - i] for i in range(order - 1, 0, -1)] + [timestep] - prev_sample = self.singlestep_dpm_solver_update( - self.model_outputs, timestep_list, prev_timestep, self.sample, order - ) - - if not return_dict: - return (prev_sample,) - - return SchedulerOutput(prev_sample=prev_sample) - - def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. - - Args: - sample (`paddle.Tensor`): input sample - - Returns: - `paddle.Tensor`: scaled input sample - """ - return sample - - def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, - ) -> paddle.Tensor: - # Make sure alphas_cumprod and timestep have same dtype as original_samples - alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype) - - sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 - sqrt_alpha_prod = sqrt_alpha_prod.flatten() - while len(sqrt_alpha_prod.shape) < len(original_samples.shape): - sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - - noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise - return noisy_samples - - def __len__(self): - return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_unidiffuser.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_unidiffuser.py deleted file mode 100644 index f8a079866159..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_unidiffuser.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle - -from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput - - -def logaddexp(x, y): - return paddle.log(1 + paddle.exp(paddle.minimum(x, y) - paddle.maximum(x, y))) + paddle.maximum(x, y) - - -def interpolate_fn(x: paddle.Tensor, xp: paddle.Tensor, yp: paddle.Tensor) -> paddle.Tensor: - """Performs piecewise linear interpolation for x, using xp and yp keypoints (knots). - Performs separate interpolation for each channel. - Args: - x: [N, C] points to be calibrated (interpolated). Batch with C channels. - xp: [C, K] x coordinates of the PWL knots. C is the number of channels, K is the number of knots. - yp: [C, K] y coordinates of the PWL knots. C is the number of channels, K is the number of knots. - Returns: - Interpolated points of the shape [N, C]. - The piecewise linear function extends for the whole x axis (the outermost keypoints define the outermost - infinite lines). - For example: - >>> calibrate1d(paddle.to_tensor([[0.5]]), paddle.to_tensor([[0.0, 1.0]]), paddle.to_tensor([[0.0, 2.0]])) - tensor([[1.0000]]) - >>> calibrate1d(paddle.to_tensor([[-10]]), paddle.to_tensor([[0.0, 1.0]]), paddle.to_tensor([[0.0, 2.0]])) - tensor([[-20.0000]]) - """ - x_breakpoints = paddle.concat([x.unsqueeze(2), xp.unsqueeze(0).tile((x.shape[0], 1, 1))], axis=2) - num_x_points = xp.shape[1] - sorted_x_breakpoints = paddle.sort(x_breakpoints, axis=2) - x_indices = paddle.argsort(x_breakpoints, axis=2) - x_idx = paddle.argmin(x_indices, axis=2) - cand_start_idx = x_idx - 1 - start_idx = paddle.where( - paddle.equal(x_idx, 0), - paddle.to_tensor([1]), - paddle.where( - paddle.equal(x_idx, num_x_points), - paddle.to_tensor([num_x_points - 2]), - cand_start_idx, - ), - ) - end_idx = paddle.where(paddle.equal(start_idx, cand_start_idx), start_idx + 2, start_idx + 1) - start_x = paddle.take_along_axis(arr=sorted_x_breakpoints, axis=2, indices=start_idx.unsqueeze(axis=2)).squeeze( - axis=2 - ) - end_x = paddle.take_along_axis(arr=sorted_x_breakpoints, axis=2, indices=end_idx.unsqueeze(axis=2)).squeeze(axis=2) - start_idx2 = paddle.where( - paddle.equal(x_idx, 0), - paddle.to_tensor([0]), - paddle.where( - paddle.equal(x_idx, num_x_points), - paddle.to_tensor([num_x_points - 2]), - cand_start_idx, - ), - ) - y_positions_expanded = yp.unsqueeze(0).expand([x.shape[0], -1, -1]) - start_y = paddle.take_along_axis(y_positions_expanded, axis=2, indices=start_idx2.unsqueeze(2)).squeeze(2) - end_y = paddle.take_along_axis(y_positions_expanded, axis=2, indices=(start_idx2 + 1).unsqueeze(2)).squeeze(2) - cand = start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x) - return cand - - -class DPMSolverUniDiffuserScheduler(SchedulerMixin, ConfigMixin): - """ - DPM-Solver (and the improved version DPM-Solver++) is a fast dedicated high-order solver for diffusion ODEs with - the convergence order guarantee. 
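The `logaddexp` helper defined above for the UniDiffuser solver computes `log(exp(x) + exp(y))` by factoring out the larger argument, which stays finite where the naive formula overflows. A quick NumPy check against NumPy's reference implementation:

```python
# Illustrative check of the numerically stable log-sum-exp of two terms.
import numpy as np

def logaddexp(x, y):
    return np.log(1 + np.exp(np.minimum(x, y) - np.maximum(x, y))) + np.maximum(x, y)

x, y = 1000.0, 999.0
print(logaddexp(x, y))                 # ~1000.3133
print(np.logaddexp(x, y))              # same value from NumPy's built-in
print(np.log(np.exp(x) + np.exp(y)))   # inf (overflow in exp with the naive formula)
```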
Empirically, sampling by DPM-Solver with only 20 steps can generate high-quality - samples, and it can generate quite good samples even in only 10 steps. - - For more details, see the original paper: https://arxiv.org/abs/2206.00927 and https://arxiv.org/abs/2211.01095 - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. - beta_start (`float`): the starting `beta` value of inference. - beta_end (`float`): the final `beta` value. - method (`str`): the update method, Choose from `multistep` or `fast`. - schedule (`str`): the schedule of NoiseScheduleVP. Default is `discrete`. - beta_schedule (`str`): - the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Default `scaled_linear`. - trained_betas (`np.ndarray`, optional): - option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. - prediction_type (`str`, default `epsilon`, optional): - prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion - process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 - https://imagen.research.google/video/paper.pdf) - algorithm_type (`str`, default `dpmsolver++`): - the algorithm type for the solver. Either `dpmsolver` or `dpmsolver++`. The `dpmsolver` type implements the - algorithms in https://arxiv.org/abs/2206.00927, and the `dpmsolver++` type implements the algorithms in - https://arxiv.org/abs/2211.01095. We recommend to use `dpmsolver++` with `solver_order=2` for guided - sampling (e.g. stable-diffusion). - solver_type (`str`, default `midpoint`): - the solver type for the second-order solver. Either `midpoint` or `heun`. The solver type slightly affects - the sample quality, especially for small number of steps. We empirically find that `midpoint` solvers are - slightly better, so we recommend to use the `midpoint` type. - """ - - _compatibles = [e.name for e in KarrasDiffusionSchedulers] - order = 1 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.00085, - beta_end: float = 0.0120, - method="multistep", - schedule: str = "discrete", - beta_schedule: str = "scaled_linear", - trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - prediction_type: str = "epsilon", - algorithm_type: str = "dpmsolver++", - solver_type: str = "midpoint", - ): - if trained_betas is not None: - self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) - if beta_schedule == "scaled_linear": - # this schedule is very specific to the unidiffuser model. 
- self.betas = ( - paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2 - ) - else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - - if schedule == "discrete": - log_alphas = 0.5 * paddle.log(1 - self.betas).cumsum(axis=0) - self.total_N = len(log_alphas) - self.t_discrete = paddle.linspace(1.0 / self.total_N, 1.0, self.total_N).reshape([1, -1]) - self.log_alpha_discrete = log_alphas.reshape((1, -1)) - else: - raise ValueError - - self.method = method - self.schedule = schedule - self.prediction_type = prediction_type - self.algorithm_type = algorithm_type - self.solver_type = solver_type - - # settings for DPM-Solver - if algorithm_type not in ["dpmsolver++"]: - if algorithm_type == "deis": - algorithm_type = "dpmsolver++" - else: - raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") - if solver_type not in ["midpoint"]: - if solver_type in ["logrho", "bh1", "bh2"]: - solver_type = "midpoint" - else: - raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") - - # standard deviation of the initial noise distribution - self.init_noise_sigma = 1.0 - self.noise_prev_list = [] - self.t_prev_list = [] - - def marginal_log_mean_coeff(self, t): - """ - Compute log(alpha_t) of a given continuous-time label t in [0, T]. - """ - if self.schedule == "discrete": - return interpolate_fn( - t.reshape((-1, 1)), self.t_discrete.clone(), self.log_alpha_discrete.clone() - ).reshape((-1,)) - else: - raise ValueError - - def marginal_alpha(self, t): - return paddle.exp(self.marginal_log_mean_coeff(t)) - - def marginal_std(self, t): - """ - Compute sigma_t of a given continuous-time label t in [0, T]. - """ - return paddle.sqrt(1.0 - paddle.exp(2.0 * self.marginal_log_mean_coeff(t))) - - def marginal_lambda(self, t): - """ - Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T]. - """ - log_mean_coeff = self.marginal_log_mean_coeff(t) - log_std = 0.5 * paddle.log(1.0 - paddle.exp(2.0 * log_mean_coeff)) - return log_mean_coeff - log_std - - def inverse_lambda(self, lamb): - if self.schedule == "discrete": - log_alpha = -0.5 * logaddexp(paddle.zeros((1,)), -2.0 * lamb) - t = interpolate_fn( - log_alpha.reshape((-1, 1)), - paddle.flip(self.log_alpha_discrete.clone(), [1]), - paddle.flip(self.t_discrete.clone(), [1]), - ) - return t.reshape((-1,)) - else: - raise ValueError - - def set_timesteps(self, num_inference_steps: int): - """ - Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. - """ - self.num_inference_steps = num_inference_steps - self.timesteps = paddle.linspace(1.0, 0.001, num_inference_steps + 1) - - self.noise_prev_list = [] - self.t_prev_list = [] - - def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor: - """ - Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs. - - DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to - discretize an integral of the data prediction model. So we need to first convert the model output to the - corresponding type to match the algorithm. - - Note that the algorithm type and the model type is decoupled. 
That is to say, we can use either DPM-Solver or - DPM-Solver++ for both noise prediction model and data prediction model. - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - - Returns: - `paddle.Tensor`: the converted model output. - """ - # DPM-Solver++ needs to solve an integral of the data prediction model. - alpha_t, sigma_t = self.marginal_alpha(timestep), self.marginal_std(timestep) - x0_pred = (sample - sigma_t * model_output) / alpha_t - return x0_pred - - def dpm_solver_first_order_update( - self, - model_output: paddle.Tensor, - timestep: int, - prev_timestep: int, - sample: paddle.Tensor, - ) -> paddle.Tensor: - """ - One step for the first-order DPM-Solver (equivalent to DDIM). - - See https://arxiv.org/abs/2206.00927 for the detailed derivation. - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - prev_timestep (`int`): previous discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - - Returns: - `paddle.Tensor`: the sample tensor at the previous timestep. - """ - lambda_t, lambda_s = self.marginal_lambda(timestep), self.marginal_lambda(prev_timestep) - alpha_t = self.marginal_log_mean_coeff(timestep) - sigma_t, sigma_s = self.marginal_std(timestep), self.marginal_std(prev_timestep) - - alpha_t = paddle.exp(alpha_t) - h = lambda_t - lambda_s - if self.config.algorithm_type == "dpmsolver++": - x_t = (sigma_t / sigma_s) * sample - (alpha_t * (paddle.exp(-h) - 1.0)) * model_output - else: - raise ValueError - return x_t - - def multistep_dpm_solver_second_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, - ) -> paddle.Tensor: - """ - One step for the second-order multistep DPM-Solver. - - Args: - model_output_list (`List[paddle.Tensor]`): - direct outputs from learned diffusion model at current and latter timesteps. - timestep (`int`): current and latter discrete timestep in the diffusion chain. - prev_timestep (`int`): previous discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - - Returns: - `paddle.Tensor`: the sample tensor at the previous timestep. 
- """ - t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2] - m0, m1 = model_output_list[-1], model_output_list[-2] - lambda_t, lambda_s0, lambda_s1 = self.marginal_lambda(t), self.marginal_lambda(s0), self.marginal_lambda(s1) - log_alpha_t = self.marginal_log_mean_coeff(t) - sigma_t, sigma_s0 = self.marginal_std(t), self.marginal_std(s0) - h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1 - r0 = h_0 / h - D0, D1 = m0, (1.0 / r0) * (m0 - m1) - alpha_t = paddle.exp(log_alpha_t) # Note: diff - if self.config.algorithm_type == "dpmsolver++": - # See https://arxiv.org/abs/2211.01095 for detailed derivations - if self.config.solver_type == "midpoint": - x_t = ( - (sigma_t / sigma_s0) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 - - 0.5 * (alpha_t * (paddle.exp(-h) - 1.0)) * D1 - ) - else: - raise ValueError - return x_t - - def multistep_dpm_solver_third_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, - ) -> paddle.Tensor: - """ - One step for the third-order multistep DPM-Solver. - - Args: - model_output_list (`List[paddle.Tensor]`): - direct outputs from learned diffusion model at current and latter timesteps. - timestep (`int`): current and latter discrete timestep in the diffusion chain. - prev_timestep (`int`): previous discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - - Returns: - `paddle.Tensor`: the sample tensor at the previous timestep. - """ - t, s0, s1, s2 = prev_timestep, timestep_list[-1], timestep_list[-2], timestep_list[-3] - m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3] - lambda_t, lambda_s0, lambda_s1, lambda_s2 = ( - self.marginal_lambda(t), - self.marginal_lambda(s0), - self.marginal_lambda(s1), - self.marginal_lambda(s2), - ) - alpha_t = self.marginal_log_mean_coeff(t) - alpha_t = paddle.exp(alpha_t) - sigma_t, sigma_s0 = self.marginal_std(t), self.marginal_std(s0) - h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2 - r0, r1 = h_0 / h, h_1 / h - D0 = m0 - D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2) - D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1) - D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1) - if self.config.algorithm_type == "dpmsolver++": - # See https://arxiv.org/abs/2206.00927 for detailed derivations - x_t = ( - (sigma_t / sigma_s0) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 - + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1 - - (alpha_t * ((paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2 - ) - else: - raise ValueError - return x_t - - def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool = True, - ) -> Union[SchedulerOutput, Tuple]: - """ - Step function propagating the sample with the multistep DPM-Solver. - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - return_dict (`bool`): option for returning tuple rather than SchedulerOutput class - - Returns: - [`~scheduling_utils.SchedulerOutput`] or `tuple`: [`~scheduling_utils.SchedulerOutput`] if `return_dict` is - True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. 
- - """ - if self.num_inference_steps is None: - raise ValueError( - "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" - ) - - step_index = (self.timesteps == timestep).nonzero() - if len(step_index) == 0: - step_index = len(self.timesteps) - 1 - else: - step_index = step_index.item() - - order = 3 - if self.method == "multistep": - if step_index == 0: - vec_t = timestep.expand([sample.shape[0]]) - model_output = self.convert_model_output(model_output, vec_t, sample) - self.noise_prev_list.append(model_output) - self.t_prev_list.append(vec_t) - - if step_index > 0 and step_index < order: - vec_t = timestep.expand([sample.shape[0]]) - sample = self.dpm_multistep_update(sample, self.noise_prev_list, self.t_prev_list, vec_t, step_index) - model_output = self.convert_model_output(model_output, vec_t, sample) - self.noise_prev_list.append(model_output) - self.t_prev_list.append(vec_t) - - if step_index >= order and step_index < len(self.timesteps): - vec_t = timestep.expand([sample.shape[0]]) - sample = self.dpm_multistep_update(sample, self.noise_prev_list, self.t_prev_list, vec_t, order) - for i in range(order - 1): - self.t_prev_list[i] = self.t_prev_list[i + 1] - self.noise_prev_list[i] = self.noise_prev_list[i + 1] - self.t_prev_list[-1] = vec_t - if step_index < len(self.timesteps) - 1: - self.noise_prev_list[-1] = self.convert_model_output(model_output, vec_t, sample) - else: - raise ValueError - - prev_sample = sample - - if not return_dict: - return (prev_sample,) - - return SchedulerOutput(prev_sample=prev_sample) - - def dpm_multistep_update(self, x, noise_prev_list, t_prev_list, t, order): - if order == 1: - return self.dpm_solver_first_order_update(noise_prev_list[-1], t, t_prev_list[-1], x) - elif order == 2: - return self.multistep_dpm_solver_second_order_update(noise_prev_list, t_prev_list, t, x) - elif order == 3: - return self.multistep_dpm_solver_third_order_update(noise_prev_list, t_prev_list, t, x) - else: - raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order)) - - def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. - - Args: - sample (`paddle.Tensor`): input sample - - Returns: - `paddle.Tensor`: scaled input sample - """ - return sample - - def __len__(self): - return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_ancestral_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_ancestral_discrete.py deleted file mode 100644 index 2492231d2380..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ /dev/null @@ -1,273 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 Katherine Crowson and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import math -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, logging, randn_tensor -from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -@dataclass -# Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->EulerAncestralDiscrete -class EulerAncestralDiscreteSchedulerOutput(BaseOutput): - """ - Output class for the scheduler's step function output. - - Args: - prev_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - pred_original_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - The predicted denoised sample (x_{0}) based on the model output from the current timestep. - `pred_original_sample` can be used to preview progress or for guidance. - """ - - prev_sample: paddle.Tensor - pred_original_sample: Optional[paddle.Tensor] = None - - -# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. - - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. - - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ - - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 - - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return paddle.to_tensor(betas, dtype=paddle.float32) - - -class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): - """ - Ancestral sampling with Euler method steps. Based on the original k-diffusion implementation by Katherine Crowson: - https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72 - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. - beta_start (`float`): the starting `beta` value of inference. - beta_end (`float`): the final `beta` value. - beta_schedule (`str`): - the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear` or `scaled_linear`. 
- trained_betas (`np.ndarray`, optional): - option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. - prediction_type (`str`, default `epsilon`, optional): - prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion - process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 - https://imagen.research.google/video/paper.pdf) - - """ - - _compatibles = [e.name for e in KarrasDiffusionSchedulers] - order = 1 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.0001, - beta_end: float = 0.02, - beta_schedule: str = "linear", - trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - prediction_type: str = "epsilon", - ): - if trained_betas is not None: - self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) - elif beta_schedule == "linear": - self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) - elif beta_schedule == "scaled_linear": - # this schedule is very specific to the latent diffusion model. - self.betas = ( - paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2 - ) - elif beta_schedule == "squaredcos_cap_v2": - # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) - else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) - sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) - self.sigmas = paddle.to_tensor(sigmas) - - # standard deviation of the initial noise distribution - self.init_noise_sigma = self.sigmas.max() - - # setable values - self.num_inference_steps = None - timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() - self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) - self.is_scale_input_called = False - - def scale_model_input(self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor]) -> paddle.Tensor: - """ - Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. - - Args: - sample (`paddle.Tensor`): input sample - timestep (`float` or `paddle.Tensor`): the current timestep in the diffusion chain - - Returns: - `paddle.Tensor`: scaled input sample - """ - step_index = (self.timesteps == timestep).nonzero().item() - sigma = self.sigmas[step_index] - sample = sample / ((sigma**2 + 1) ** 0.5) - self.is_scale_input_called = True - return sample - - def set_timesteps(self, num_inference_steps: int): - """ - Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. 
- """ - self.num_inference_steps = num_inference_steps - - timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) - sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) - sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) - self.sigmas = paddle.to_tensor(sigmas) - self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) - - def step( - self, - model_output: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - sample: paddle.Tensor, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - return_dict: bool = True, - ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]: - """ - Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`float`): current timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - generator (`paddle.Generator`, optional): Random number generator. - return_dict (`bool`): option for returning tuple rather than EulerAncestralDiscreteSchedulerOutput class - - Returns: - [`~schedulers.scheduling_utils.EulerAncestralDiscreteSchedulerOutput`] or `tuple`: - [`~schedulers.scheduling_utils.EulerAncestralDiscreteSchedulerOutput`] if `return_dict` is True, otherwise - a `tuple`. When returning a tuple, the first element is the sample tensor. - - """ - if not self.is_scale_input_called: - logger.warning( - "The `scale_model_input` function should be called before `step` to ensure correct denoising. " - "See `StableDiffusionPipeline` for a usage example." - ) - step_index = (self.timesteps == timestep).nonzero().item() - sigma = self.sigmas[step_index] - - # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise - if self.config.prediction_type == "epsilon": - pred_original_sample = sample - sigma * model_output - elif self.config.prediction_type == "v_prediction": - # * c_out + input * c_skip - pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) - elif self.config.prediction_type == "sample": - raise NotImplementedError("prediction_type not implemented yet: sample") - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" - ) - - sigma_from = self.sigmas[step_index] - sigma_to = self.sigmas[step_index + 1] - sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5 - sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5 - - # 2. 
Convert to an ODE derivative - derivative = (sample - pred_original_sample) / sigma - - dt = sigma_down - sigma - - prev_sample = sample + derivative * dt - - noise = randn_tensor(model_output.shape, dtype=model_output.dtype, generator=generator) - - prev_sample = prev_sample + noise * sigma_up - - if not return_dict: - return (prev_sample,) - - return EulerAncestralDiscreteSchedulerOutput( - prev_sample=prev_sample, pred_original_sample=pred_original_sample - ) - - def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, - ) -> paddle.Tensor: - # Make sure sigmas and timesteps have the same dtype as original_samples - sigmas = self.sigmas.cast(original_samples.dtype) - - schedule_timesteps = self.timesteps - step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] - - sigma = sigmas[step_indices].flatten() - while len(sigma.shape) < len(original_samples.shape): - sigma = sigma.unsqueeze(-1) - - noisy_samples = original_samples + noise * sigma - return noisy_samples - - def __len__(self): - return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_discrete.py deleted file mode 100644 index 85c539e10488..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_discrete.py +++ /dev/null @@ -1,347 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 Katherine Crowson and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, logging, randn_tensor -from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -@dataclass -# Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->EulerDiscrete -class EulerDiscreteSchedulerOutput(BaseOutput): - """ - Output class for the scheduler's step function output. - - Args: - prev_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - pred_original_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - The predicted denoised sample (x_{0}) based on the model output from the current timestep. - `pred_original_sample` can be used to preview progress or for guidance. 
- """ - - prev_sample: paddle.Tensor - pred_original_sample: Optional[paddle.Tensor] = None - - -# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. - - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. - - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ - - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 - - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return paddle.to_tensor(betas, dtype=paddle.float32) - - -class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): - """ - Euler scheduler (Algorithm 2) from Karras et al. (2022) https://arxiv.org/abs/2206.00364. . Based on the original - k-diffusion implementation by Katherine Crowson: - https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L51 - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. - beta_start (`float`): the starting `beta` value of inference. - beta_end (`float`): the final `beta` value. - beta_schedule (`str`): - the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear` or `scaled_linear`. - trained_betas (`np.ndarray`, optional): - option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. - prediction_type (`str`, default `"epsilon"`, optional): - prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion - process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 - https://imagen.research.google/video/paper.pdf) - interpolation_type (`str`, default `"linear"`, optional): - interpolation type to compute intermediate sigmas for the scheduler denoising steps. Should be one of - [`"linear"`, `"log_linear"`]. - use_karras_sigmas (`bool`, *optional*, defaults to `False`): - This parameter controls whether to use Karras sigmas (Karras et al. (2022) scheme) for step sizes in the - noise schedule during the sampling process. If True, the sigmas will be determined according to a sequence - of noise levels {σi} as defined in Equation (5) of the paper https://arxiv.org/pdf/2206.00364.pdf. 
- """ - - _compatibles = [e.name for e in KarrasDiffusionSchedulers] - order = 1 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.0001, - beta_end: float = 0.02, - beta_schedule: str = "linear", - trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - prediction_type: str = "epsilon", - interpolation_type: str = "linear", - use_karras_sigmas: Optional[bool] = False, - ): - if trained_betas is not None: - self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) - elif beta_schedule == "linear": - self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) - elif beta_schedule == "scaled_linear": - # this schedule is very specific to the latent diffusion model. - self.betas = ( - paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2 - ) - elif beta_schedule == "squaredcos_cap_v2": - # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) - else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) - sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) - self.sigmas = paddle.to_tensor(sigmas) - - # standard deviation of the initial noise distribution - self.init_noise_sigma = self.sigmas.max() - - # setable values - self.num_inference_steps = None - timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() - self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) - self.is_scale_input_called = False - self.use_karras_sigmas = use_karras_sigmas - - def scale_model_input(self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor]) -> paddle.Tensor: - """ - Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. - - Args: - sample (`paddle.Tensor`): input sample - timestep (`float` or `paddle.Tensor`): the current timestep in the diffusion chain - - Returns: - `paddle.Tensor`: scaled input sample - """ - step_index = (self.timesteps == timestep).nonzero().item() - sigma = self.sigmas[step_index] - - sample = sample / ((sigma**2 + 1) ** 0.5) - - self.is_scale_input_called = True - return sample - - def set_timesteps(self, num_inference_steps: int): - """ - Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. - """ - self.num_inference_steps = num_inference_steps - - timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) - log_sigmas = np.log(sigmas) - - if self.config.interpolation_type == "linear": - sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) - elif self.config.interpolation_type == "log_linear": - sigmas = paddle.linspace(np.log(sigmas[-1]), np.log(sigmas[0]), num_inference_steps + 1).exp() - else: - raise ValueError( - f"{self.config.interpolation_type} is not implemented. 
Please specify interpolation_type to either" - " 'linear' or 'log_linear'" - ) - - if self.use_karras_sigmas: - sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps) - timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) - - sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) - self.sigmas = paddle.to_tensor(sigmas) - self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) - - def _sigma_to_t(self, sigma, log_sigmas): - # get log sigma - log_sigma = np.log(sigma) - - # get distribution - dists = log_sigma - log_sigmas[:, np.newaxis] - - # get sigmas range - low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) - high_idx = low_idx + 1 - - low = log_sigmas[low_idx] - high = log_sigmas[high_idx] - - # interpolate sigmas - w = (low - log_sigma) / (low - high) - w = np.clip(w, 0, 1) - - # transform interpolation to time range - t = (1 - w) * low_idx + w * high_idx - t = t.reshape(sigma.shape) - return t - - def _convert_to_karras(self, in_sigmas: paddle.Tensor, num_inference_steps) -> paddle.Tensor: - """Constructs the noise schedule of Karras et al. (2022).""" - - sigma_min = in_sigmas[-1].item() - sigma_max = in_sigmas[0].item() - - rho = 7.0 # 7.0 is the value used in the paper - ramp = np.linspace(0, 1, num_inference_steps) - min_inv_rho = sigma_min ** (1 / rho) - max_inv_rho = sigma_max ** (1 / rho) - sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho - return sigmas - - def step( - self, - model_output: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - sample: paddle.Tensor, - s_churn: float = 0.0, - s_tmin: float = 0.0, - s_tmax: float = float("inf"), - s_noise: float = 1.0, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - return_dict: bool = True, - ) -> Union[EulerDiscreteSchedulerOutput, Tuple]: - """ - Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`float`): current timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - s_churn (`float`) - s_tmin (`float`) - s_tmax (`float`) - s_noise (`float`) - generator (`paddle.Generator`, optional): Random number generator. - return_dict (`bool`): option for returning tuple rather than EulerDiscreteSchedulerOutput class - - Returns: - [`~schedulers.scheduling_utils.EulerDiscreteSchedulerOutput`] or `tuple`: - [`~schedulers.scheduling_utils.EulerDiscreteSchedulerOutput`] if `return_dict` is True, otherwise a - `tuple`. When returning a tuple, the first element is the sample tensor. - - """ - - if not self.is_scale_input_called: - logger.warning( - "The `scale_model_input` function should be called before `step` to ensure correct denoising. " - "See `StableDiffusionPipeline` for a usage example." - ) - - step_index = (self.timesteps == timestep).nonzero().item() - sigma = self.sigmas[step_index] - - gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0 - - noise = randn_tensor(model_output.shape, dtype=model_output.dtype, generator=generator) - - eps = noise * s_noise - sigma_hat = sigma * (gamma + 1) - - if gamma > 0: - sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5 - - # 1. 
compute predicted original sample (x_0) from sigma-scaled predicted noise - # NOTE: "original_sample" should not be an expected prediction_type but is left in for - # backwards compatibility - if self.config.prediction_type == "original_sample" or self.config.prediction_type == "sample": - pred_original_sample = model_output - elif self.config.prediction_type == "epsilon": - pred_original_sample = sample - sigma_hat * model_output - elif self.config.prediction_type == "v_prediction": - # * c_out + input * c_skip - pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" - ) - - # 2. Convert to an ODE derivative - derivative = (sample - pred_original_sample) / sigma_hat - - dt = self.sigmas[step_index + 1] - sigma_hat - - prev_sample = sample + derivative * dt - - if not return_dict: - return (prev_sample,) - - return EulerDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) - - def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, - ) -> paddle.Tensor: - # Make sure sigmas and timesteps have the same dtype as original_samples - sigmas = self.sigmas.cast(original_samples.dtype) - - schedule_timesteps = self.timesteps - step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] - - sigma = sigmas[step_indices].flatten() - while len(sigma.shape) < len(original_samples.shape): - sigma = sigma.unsqueeze(-1) - - noisy_samples = original_samples + noise * sigma - return noisy_samples - - def __len__(self): - return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_heun_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_heun_discrete.py deleted file mode 100644 index ce7691521271..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_heun_discrete.py +++ /dev/null @@ -1,339 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 Katherine Crowson, The HuggingFace Team and hlky. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle - -from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput - - -# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. 
- - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. - - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ - - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 - - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return paddle.to_tensor(betas, dtype=paddle.float32) - - -class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin): - """ - Implements Algorithm 2 (Heun steps) from Karras et al. (2022). for discrete beta schedules. Based on the original - k-diffusion implementation by Katherine Crowson: - https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L90 - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. beta_start (`float`): the - starting `beta` value of inference. beta_end (`float`): the final `beta` value. beta_schedule (`str`): - the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear` or `scaled_linear`. - trained_betas (`np.ndarray`, optional): - option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. - options to clip the variance used when adding noise to the denoised sample. Choose from `fixed_small`, - `fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`. - prediction_type (`str`, default `epsilon`, optional): - prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion - process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 - https://imagen.research.google/video/paper.pdf) - use_karras_sigmas (`bool`, *optional*, defaults to `False`): - This parameter controls whether to use Karras sigmas (Karras et al. (2022) scheme) for step sizes in the - noise schedule during the sampling process. If True, the sigmas will be determined according to a sequence - of noise levels {σi} as defined in Equation (5) of the paper https://arxiv.org/pdf/2206.00364.pdf. - """ - - _compatibles = [e.name for e in KarrasDiffusionSchedulers] - order = 2 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.00085, # sensible defaults - beta_end: float = 0.012, - beta_schedule: str = "linear", - trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - prediction_type: str = "epsilon", - use_karras_sigmas: Optional[bool] = False, - ): - if trained_betas is not None: - self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) - elif beta_schedule == "linear": - self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) - elif beta_schedule == "scaled_linear": - # this schedule is very specific to the latent diffusion model. 
- self.betas = ( - paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2 - ) - elif beta_schedule == "squaredcos_cap_v2": - # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) - else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - - # set all values - self.set_timesteps(num_train_timesteps, num_train_timesteps) - self.use_karras_sigmas = use_karras_sigmas - - def index_for_timestep(self, timestep, schedule_timesteps=None): - if schedule_timesteps is None: - schedule_timesteps = self.timesteps - - indices = (schedule_timesteps == timestep).nonzero() - - if self.state_in_first_order: - pos = -1 - else: - pos = 0 - return indices[pos].item() - - def scale_model_input( - self, - sample: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - ) -> paddle.Tensor: - """ - Args: - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. - sample (`paddle.Tensor`): input sample timestep (`int`, optional): current timestep - Returns: - `paddle.Tensor`: scaled input sample - """ - step_index = self.index_for_timestep(timestep) - - sigma = self.sigmas[step_index] - sample = sample / ((sigma**2 + 1) ** 0.5) - return sample - - def set_timesteps( - self, - num_inference_steps: int, - num_train_timesteps: Optional[int] = None, - ): - """ - Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. - """ - self.num_inference_steps = num_inference_steps - - num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps - - timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() - - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) - log_sigmas = np.log(sigmas) - sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) - - if self.use_karras_sigmas: - sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps) - timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) - - sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) - sigmas = paddle.to_tensor(sigmas) - self.sigmas = paddle.concat([sigmas[:1], sigmas[1:-1].repeat_interleave(2), sigmas[-1:]]) - - # standard deviation of the initial noise distribution - self.init_noise_sigma = self.sigmas.max() - - timesteps = paddle.to_tensor(timesteps) - timesteps = paddle.concat([timesteps[:1], timesteps[1:].repeat_interleave(2)]) - - self.timesteps = timesteps.cast(paddle.float32) - - # empty dt and derivative - self.prev_derivative = None - self.dt = None - - def _sigma_to_t(self, sigma, log_sigmas): - # get log sigma - log_sigma = np.log(sigma) - - # get distribution - dists = log_sigma - log_sigmas[:, np.newaxis] - - # get sigmas range - low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) - high_idx = low_idx + 1 - - low = log_sigmas[low_idx] - high = log_sigmas[high_idx] - - # interpolate sigmas - w = (low - log_sigma) / (low - high) - w = np.clip(w, 0, 1) - - # transform interpolation to time range - t = (1 - w) * low_idx + w * high_idx - t = t.reshape(sigma.shape) - return t - - def _convert_to_karras(self, in_sigmas: 
paddle.Tensor, num_inference_steps) -> paddle.Tensor: - """Constructs the noise schedule of Karras et al. (2022).""" - - sigma_min = in_sigmas[-1].item() - sigma_max = in_sigmas[0].item() - - rho = 7.0 # 7.0 is the value used in the paper - ramp = np.linspace(0, 1, num_inference_steps) - min_inv_rho = sigma_min ** (1 / rho) - max_inv_rho = sigma_max ** (1 / rho) - sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho - return sigmas - - @property - def state_in_first_order(self): - return self.dt is None - - def step( - self, - model_output: Union[paddle.Tensor, np.ndarray], - timestep: Union[float, paddle.Tensor], - sample: Union[paddle.Tensor, np.ndarray], - return_dict: bool = True, - ) -> Union[SchedulerOutput, Tuple]: - """ - Args: - Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion - process from the learned model outputs (most often the predicted noise). - model_output (`paddle.Tensor` or `np.ndarray`): direct output from learned diffusion model. timestep - (`int`): current discrete timestep in the diffusion chain. sample (`paddle.Tensor` or `np.ndarray`): - current instance of sample being created by diffusion process. - return_dict (`bool`): option for returning tuple rather than SchedulerOutput class - Returns: - [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: - [`~schedulers.scheduling_utils.SchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. - """ - step_index = self.index_for_timestep(timestep) - - if self.state_in_first_order: - sigma = self.sigmas[step_index] - sigma_next = self.sigmas[step_index + 1] - else: - # 2nd order / Heun's method - sigma = self.sigmas[step_index - 1] - sigma_next = self.sigmas[step_index] - - # currently only gamma=0 is supported. This usually works best anyways. - # We can support gamma in the future but then need to scale the timestep before - # passing it to the model which requires a change in API - gamma = 0 - sigma_hat = sigma * (gamma + 1) # Note: sigma_hat == sigma for now - - # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise - if self.config.prediction_type == "epsilon": - sigma_input = sigma_hat if self.state_in_first_order else sigma_next - pred_original_sample = sample - sigma_input * model_output - elif self.config.prediction_type == "v_prediction": - sigma_input = sigma_hat if self.state_in_first_order else sigma_next - pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + ( - sample / (sigma_input**2 + 1) - ) - elif self.config.prediction_type == "sample": - raise NotImplementedError("prediction_type not implemented yet: sample") - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" - ) - - if self.state_in_first_order: - # 2. Convert to an ODE derivative for 1st order - derivative = (sample - pred_original_sample) / sigma_hat - # 3. delta timestep - dt = sigma_next - sigma_hat - - # store for 2nd order step - self.prev_derivative = derivative - self.dt = dt - self.sample = sample - else: - # 2. 2nd order / Heun's method - derivative = (sample - pred_original_sample) / sigma_next - derivative = (self.prev_derivative + derivative) / 2 - - # 3. 
take prev timestep & sample - dt = self.dt - sample = self.sample - - # free dt and derivative - # Note, this puts the scheduler in "first order mode" - self.prev_derivative = None - self.dt = None - self.sample = None - - prev_sample = sample + derivative * dt - - if not return_dict: - return (prev_sample,) - - return SchedulerOutput(prev_sample=prev_sample) - - def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, - ) -> paddle.Tensor: - # Make sure sigmas and timesteps have the same dtype as original_samples - sigmas = self.sigmas.cast(original_samples.dtype) - - schedule_timesteps = self.timesteps - step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] - - sigma = sigmas[step_indices].flatten() - while len(sigma.shape) < len(original_samples.shape): - sigma = sigma.unsqueeze(-1) - - noisy_samples = original_samples + noise * sigma - return noisy_samples - - def __len__(self): - return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_ipndm.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_ipndm.py deleted file mode 100644 index b4323a6dcf8b..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_ipndm.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 Zhejiang University Team and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle - -from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import SchedulerMixin, SchedulerOutput - - -class IPNDMScheduler(SchedulerMixin, ConfigMixin): - """ - Improved Pseudo numerical methods for diffusion models (iPNDM) ported from @crowsonkb's amazing k-diffusion - [library](https://github.com/crowsonkb/v-diffusion-pytorch/blob/987f8985e38208345c1959b0ea767a625831cc9b/diffusion/sampling.py#L296) - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - For more details, see the original paper: https://arxiv.org/abs/2202.09778 - - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. - """ - - order = 1 - - @register_to_config - def __init__( - self, num_train_timesteps: int = 1000, trained_betas: Optional[Union[np.ndarray, List[float]]] = None - ): - # set `betas`, `alphas`, `timesteps` - self.set_timesteps(num_train_timesteps) - - # standard deviation of the initial noise distribution - self.init_noise_sigma = 1.0 - - # For now we only support F-PNDM, i.e. 
the runge-kutta method - # For more information on the algorithm please take a look at the paper: https://arxiv.org/pdf/2202.09778.pdf - # mainly at formula (9), (12), (13) and the Algorithm 2. - self.pndm_order = 4 - - # running values - self.ets = [] - - def set_timesteps(self, num_inference_steps: int): - """ - Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. - """ - self.num_inference_steps = num_inference_steps - steps = paddle.linspace(1, 0, num_inference_steps + 1)[:-1] - steps = paddle.concat([steps, paddle.to_tensor([0.0])]) - - if self.config.trained_betas is not None: - self.betas = paddle.to_tensor(self.config.trained_betas, dtype=paddle.float32) - else: - self.betas = paddle.sin(steps * math.pi / 2) ** 2 - - self.alphas = (1.0 - self.betas**2) ** 0.5 - - self.timesteps = (paddle.atan2(self.betas, self.alphas) / math.pi * 2)[:-1] - - self.ets = [] - - def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool = True, - ) -> Union[SchedulerOutput, Tuple]: - """ - Step function propagating the sample with the linear multi-step method. This has one forward pass with multiple - times to approximate the solution. - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - return_dict (`bool`): option for returning tuple rather than SchedulerOutput class - - Returns: - [`~scheduling_utils.SchedulerOutput`] or `tuple`: [`~scheduling_utils.SchedulerOutput`] if `return_dict` is - True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. - - """ - # TODO we use float32 to compute model_output and sample due to 1e-8 - model_output = model_output.cast("float32") - sample = sample.cast("float32") - - if self.num_inference_steps is None: - raise ValueError( - "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" - ) - - timestep_index = (self.timesteps == timestep).nonzero().item() - prev_timestep_index = timestep_index + 1 - - ets = sample * self.betas[timestep_index] + model_output * self.alphas[timestep_index] - self.ets.append(ets) - - if len(self.ets) == 1: - ets = self.ets[-1] - elif len(self.ets) == 2: - ets = (3 * self.ets[-1] - self.ets[-2]) / 2 - elif len(self.ets) == 3: - ets = (23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12 - else: - ets = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + 37 * self.ets[-3] - 9 * self.ets[-4]) - - prev_sample = self._get_prev_sample(sample, timestep_index, prev_timestep_index, ets) - - if not return_dict: - return (prev_sample,) - - return SchedulerOutput(prev_sample=prev_sample) - - def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. 
- - Args: - sample (`paddle.Tensor`): input sample - - Returns: - `paddle.Tensor`: scaled input sample - """ - return sample - - def _get_prev_sample(self, sample, timestep_index, prev_timestep_index, ets): - alpha = self.alphas[timestep_index] - sigma = self.betas[timestep_index] - - next_alpha = self.alphas[prev_timestep_index] - next_sigma = self.betas[prev_timestep_index] - - pred = (sample - sigma * ets) / max(alpha, 1e-8) - prev_sample = next_alpha * pred + ets * next_sigma - - return prev_sample - - def __len__(self): - return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py deleted file mode 100644 index 573715495dee..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +++ /dev/null @@ -1,345 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 Katherine Crowson, The HuggingFace Team and hlky. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import randn_tensor -from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput - - -# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. - - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. - - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ - - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 - - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return paddle.to_tensor(betas, dtype=paddle.float32) - - -class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): - """ - Scheduler created by @crowsonkb in [k_diffusion](https://github.com/crowsonkb/k-diffusion), see: - https://github.com/crowsonkb/k-diffusion/blob/5b3af030dd83e0297272d861c19477735d0317ec/k_diffusion/sampling.py#L188 - - Scheduler inspired by DPM-Solver-2 and Algorthim 2 from Karras et al. (2022). - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. 
They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. beta_start (`float`): the - starting `beta` value of inference. beta_end (`float`): the final `beta` value. beta_schedule (`str`): - the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear` or `scaled_linear`. - trained_betas (`np.ndarray`, optional): - option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. - options to clip the variance used when adding noise to the denoised sample. Choose from `fixed_small`, - `fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`. - prediction_type (`str`, default `epsilon`, optional): - prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion - process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 - https://imagen.research.google/video/paper.pdf) - """ - - _compatibles = [e.name for e in KarrasDiffusionSchedulers] - order = 2 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.00085, # sensible defaults - beta_end: float = 0.012, - beta_schedule: str = "linear", - trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - prediction_type: str = "epsilon", - ): - if trained_betas is not None: - self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) - elif beta_schedule == "linear": - self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) - elif beta_schedule == "scaled_linear": - # this schedule is very specific to the latent diffusion model. - self.betas = ( - paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2 - ) - elif beta_schedule == "squaredcos_cap_v2": - # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) - else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - - # set all values - self.set_timesteps(num_train_timesteps, num_train_timesteps) - - def index_for_timestep(self, timestep, schedule_timesteps=None): - if schedule_timesteps is None: - schedule_timesteps = self.timesteps - - indices = (schedule_timesteps == timestep).nonzero() - - if self.state_in_first_order: - pos = -1 - else: - pos = 0 - return indices[pos].item() - - def scale_model_input( - self, - sample: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - ) -> paddle.Tensor: - """ - Args: - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. 
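Because the interleaved schedule built by set_timesteps can contain the same timestep value twice, index_for_timestep above picks the last match while the scheduler is in its first-order state and the first match otherwise. A toy sketch of that lookup, with made-up values and no paddle dependency:

# Illustrative only: which of two duplicate schedule entries the lookup selects.
import numpy as np

schedule_timesteps = np.array([999.0, 987.3, 987.3, 974.6, 974.6])  # toy schedule
timestep = 987.3
matches = np.nonzero(schedule_timesteps == timestep)[0]

state_in_first_order = True
pos = -1 if state_in_first_order else 0
print(matches, matches[pos])  # [1 2] -> index 2 in first order, index 1 otherwise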
- sample (`paddle.Tensor`): input sample timestep (`int`, optional): current timestep - Returns: - `paddle.Tensor`: scaled input sample - """ - step_index = self.index_for_timestep(timestep) - - if self.state_in_first_order: - sigma = self.sigmas[step_index] - else: - sigma = self.sigmas_interpol[step_index - 1] - - sample = sample / ((sigma**2 + 1) ** 0.5) - return sample - - def set_timesteps( - self, - num_inference_steps: int, - num_train_timesteps: Optional[int] = None, - ): - """ - Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. - """ - self.num_inference_steps = num_inference_steps - - num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps - - timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() - - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) - self.log_sigmas = paddle.to_tensor(np.log(sigmas), dtype=paddle.float32) - - sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) - sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) - sigmas = paddle.to_tensor(sigmas) - - # compute up and down sigmas - sigmas_next = sigmas.roll(-1) - sigmas_next[-1] = 0.0 - sigmas_up = (sigmas_next**2 * (sigmas**2 - sigmas_next**2) / sigmas**2) ** 0.5 - sigmas_down = (sigmas_next**2 - sigmas_up**2) ** 0.5 - sigmas_down[-1] = 0.0 - - # compute interpolated sigmas - sigmas_interpol = sigmas.log().lerp(sigmas_down.log(), 0.5).exp() - sigmas_interpol[-2:] = 0.0 - - # set sigmas - self.sigmas = paddle.concat([sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]]) - self.sigmas_interpol = paddle.concat( - [sigmas_interpol[:1], sigmas_interpol[1:].repeat_interleave(2), sigmas_interpol[-1:]] - ) - self.sigmas_up = paddle.concat([sigmas_up[:1], sigmas_up[1:].repeat_interleave(2), sigmas_up[-1:]]) - self.sigmas_down = paddle.concat([sigmas_down[:1], sigmas_down[1:].repeat_interleave(2), sigmas_down[-1:]]) - - # standard deviation of the initial noise distribution - self.init_noise_sigma = self.sigmas.max() - - timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) - - timesteps_interpol = self.sigma_to_t(sigmas_interpol) - timesteps_interpol = paddle.cast(timesteps_interpol, dtype=timesteps.dtype) - - interleaved_timesteps = paddle.stack((timesteps_interpol[:-2, None], timesteps[1:, None]), axis=-1).flatten() - - self.timesteps = paddle.concat([timesteps[:1], interleaved_timesteps]) - - self.sample = None - - def sigma_to_t(self, sigma): - # get log sigma - log_sigma = sigma.log() - - # get distribution - dists = log_sigma - self.log_sigmas[:, None] - - # get sigmas range - low_idx = (dists >= 0).cast("int64").cumsum(axis=0).argmax(axis=0).clip(max=self.log_sigmas.shape[0] - 2) - high_idx = low_idx + 1 - - low = self.log_sigmas[low_idx] - high = self.log_sigmas[high_idx] - - # interpolate sigmas - w = (low - log_sigma) / (low - high) - w = w.clip(0, 1) - - # transform interpolation to time range - t = (1 - w) * low_idx + w * high_idx - t = t.reshape(sigma.shape) - return t - - @property - def state_in_first_order(self): - return self.sample is None - - def step( - self, - model_output: Union[paddle.Tensor, np.ndarray], - timestep: Union[float, paddle.Tensor], - sample: Union[paddle.Tensor, np.ndarray], - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - return_dict: bool = True, - ) -> 
Union[SchedulerOutput, Tuple]: - """ - Args: - Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion - process from the learned model outputs (most often the predicted noise). - model_output (`paddle.Tensor` or `np.ndarray`): direct output from learned diffusion model. timestep - (`int`): current discrete timestep in the diffusion chain. sample (`paddle.Tensor` or `np.ndarray`): - current instance of sample being created by diffusion process. - return_dict (`bool`): option for returning tuple rather than SchedulerOutput class - Returns: - [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: - [`~schedulers.scheduling_utils.SchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. - """ - step_index = self.index_for_timestep(timestep) - - if self.state_in_first_order: - sigma = self.sigmas[step_index] - sigma_interpol = self.sigmas_interpol[step_index] - sigma_up = self.sigmas_up[step_index] - sigma_down = self.sigmas_down[step_index - 1] - else: - # 2nd order / KPDM2's method - sigma = self.sigmas[step_index - 1] - sigma_interpol = self.sigmas_interpol[step_index - 1] - sigma_up = self.sigmas_up[step_index - 1] - sigma_down = self.sigmas_down[step_index - 1] - - # currently only gamma=0 is supported. This usually works best anyways. - # We can support gamma in the future but then need to scale the timestep before - # passing it to the model which requires a change in API - gamma = 0 - sigma_hat = sigma * (gamma + 1) # Note: sigma_hat == sigma for now - - noise = randn_tensor(model_output.shape, dtype=model_output.dtype, generator=generator) - - # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise - if self.config.prediction_type == "epsilon": - sigma_input = sigma_hat if self.state_in_first_order else sigma_interpol - pred_original_sample = sample - sigma_input * model_output - elif self.config.prediction_type == "v_prediction": - sigma_input = sigma_hat if self.state_in_first_order else sigma_interpol - pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + ( - sample / (sigma_input**2 + 1) - ) - elif self.config.prediction_type == "sample": - raise NotImplementedError("prediction_type not implemented yet: sample") - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" - ) - - if self.state_in_first_order: - # 2. Convert to an ODE derivative for 1st order - derivative = (sample - pred_original_sample) / sigma_hat - # 3. delta timestep - dt = sigma_interpol - sigma_hat - - # store for 2nd order step - self.sample = sample - self.dt = dt - prev_sample = sample + derivative * dt - else: - # DPM-Solver-2 - # 2. Convert to an ODE derivative for 2nd order - derivative = (sample - pred_original_sample) / sigma_interpol - # 3. 
delta timestep - dt = sigma_down - sigma_hat - - sample = self.sample - self.sample = None - - prev_sample = sample + derivative * dt - prev_sample = prev_sample + noise * sigma_up - - if not return_dict: - return (prev_sample,) - - return SchedulerOutput(prev_sample=prev_sample) - - def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, - ) -> paddle.Tensor: - # Make sure sigmas and timesteps have the same dtype as original_samples - sigmas = self.sigmas.cast(original_samples.dtype) - - schedule_timesteps = self.timesteps - step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] - - sigma = sigmas[step_indices].flatten() - while len(sigma.shape) < len(original_samples.shape): - sigma = sigma.unsqueeze(-1) - - noisy_samples = original_samples + noise * sigma - return noisy_samples - - def __len__(self): - return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_discrete.py deleted file mode 100644 index ffcc918a3041..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_discrete.py +++ /dev/null @@ -1,327 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 Katherine Crowson, The HuggingFace Team and hlky. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle - -from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput - - -# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. - - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. 
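The ancestral update above splits each transition from sigma to sigma_next into a deterministic part (sigma_down) and a re-injected noise part (sigma_up), so that sigma_down**2 + sigma_up**2 recovers sigma_next**2. A scalar sketch with arbitrary numbers:

# Illustrative check of the sigma_up / sigma_down split used by the ancestral scheduler.
import numpy as np

sigma, sigma_next = 8.0, 5.0
sigma_up = (sigma_next**2 * (sigma**2 - sigma_next**2) / sigma**2) ** 0.5
sigma_down = (sigma_next**2 - sigma_up**2) ** 0.5
print(sigma_up, sigma_down, np.isclose(sigma_down**2 + sigma_up**2, sigma_next**2))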
- - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ - - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 - - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return paddle.to_tensor(betas, dtype=paddle.float32) - - -class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin): - """ - Scheduler created by @crowsonkb in [k_diffusion](https://github.com/crowsonkb/k-diffusion), see: - https://github.com/crowsonkb/k-diffusion/blob/5b3af030dd83e0297272d861c19477735d0317ec/k_diffusion/sampling.py#L188 - - Scheduler inspired by DPM-Solver-2 and Algorthim 2 from Karras et al. (2022). - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. beta_start (`float`): the - starting `beta` value of inference. beta_end (`float`): the final `beta` value. beta_schedule (`str`): - the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear` or `scaled_linear`. - trained_betas (`np.ndarray`, optional): - option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. - options to clip the variance used when adding noise to the denoised sample. Choose from `fixed_small`, - `fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`. - prediction_type (`str`, default `epsilon`, optional): - prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion - process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 - https://imagen.research.google/video/paper.pdf) - """ - - _compatibles = [e.name for e in KarrasDiffusionSchedulers] - order = 2 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.00085, # sensible defaults - beta_end: float = 0.012, - beta_schedule: str = "linear", - trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - prediction_type: str = "epsilon", - ): - if trained_betas is not None: - self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) - elif beta_schedule == "linear": - self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) - elif beta_schedule == "scaled_linear": - # this schedule is very specific to the latent diffusion model. 
- self.betas = ( - paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2 - ) - elif beta_schedule == "squaredcos_cap_v2": - # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) - else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - - # set all values - self.set_timesteps(num_train_timesteps, num_train_timesteps) - - def index_for_timestep(self, timestep, schedule_timesteps=None): - if schedule_timesteps is None: - schedule_timesteps = self.timesteps - - indices = (schedule_timesteps == timestep).nonzero() - - if self.state_in_first_order: - pos = -1 - else: - pos = 0 - return indices[pos].item() - - def scale_model_input( - self, - sample: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - ) -> paddle.Tensor: - """ - Args: - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. - sample (`paddle.Tensor`): input sample timestep (`int`, optional): current timestep - Returns: - `paddle.Tensor`: scaled input sample - """ - step_index = self.index_for_timestep(timestep) - - if self.state_in_first_order: - sigma = self.sigmas[step_index] - else: - sigma = self.sigmas_interpol[step_index] - - sample = sample / ((sigma**2 + 1) ** 0.5) - return sample - - def set_timesteps( - self, - num_inference_steps: int, - num_train_timesteps: Optional[int] = None, - ): - """ - Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. 
- """ - self.num_inference_steps = num_inference_steps - - num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps - - timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() - - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) - self.log_sigmas = paddle.to_tensor(np.log(sigmas), dtype=paddle.float32) - - sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) - sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) - sigmas = paddle.to_tensor(sigmas) - - # interpolate sigmas - sigmas_interpol = sigmas.log().lerp(sigmas.roll(1).log(), 0.5).exp() - # must set to 0.0 - sigmas_interpol[-1] = 0.0 - - self.sigmas = paddle.concat([sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]]) - self.sigmas_interpol = paddle.concat( - [sigmas_interpol[:1], sigmas_interpol[1:].repeat_interleave(2), sigmas_interpol[-1:]] - ) - - # standard deviation of the initial noise distribution - self.init_noise_sigma = self.sigmas.max() - - timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) - # interpolate timesteps - timesteps_interpol = self.sigma_to_t(sigmas_interpol) - timesteps_interpol = paddle.cast(timesteps_interpol, dtype=timesteps.dtype) - interleaved_timesteps = paddle.stack((timesteps_interpol[1:-1, None], timesteps[1:, None]), axis=-1).flatten() - - self.timesteps = paddle.concat([timesteps[:1], interleaved_timesteps]) - - self.sample = None - - def sigma_to_t(self, sigma): - # get log sigma - log_sigma = sigma.log() - - # get distribution - dists = log_sigma - self.log_sigmas[:, None] - - # get sigmas range - low_idx = (dists >= 0).cast("int64").cumsum(axis=0).argmax(axis=0).clip(max=self.log_sigmas.shape[0] - 2) - high_idx = low_idx + 1 - - low = self.log_sigmas[low_idx] - high = self.log_sigmas[high_idx] - - # interpolate sigmas - w = (low - log_sigma) / (low - high) - w = w.clip(0, 1) - - # transform interpolation to time range - t = (1 - w) * low_idx + w * high_idx - t = t.reshape(sigma.shape) - return t - - @property - def state_in_first_order(self): - return self.sample is None - - def step( - self, - model_output: Union[paddle.Tensor, np.ndarray], - timestep: Union[float, paddle.Tensor], - sample: Union[paddle.Tensor, np.ndarray], - return_dict: bool = True, - ) -> Union[SchedulerOutput, Tuple]: - """ - Args: - Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion - process from the learned model outputs (most often the predicted noise). - model_output (`paddle.Tensor` or `np.ndarray`): direct output from learned diffusion model. timestep - (`int`): current discrete timestep in the diffusion chain. sample (`paddle.Tensor` or `np.ndarray`): - current instance of sample being created by diffusion process. - return_dict (`bool`): option for returning tuple rather than SchedulerOutput class - Returns: - [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: - [`~schedulers.scheduling_utils.SchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. 
- """ - step_index = self.index_for_timestep(timestep) - - if self.state_in_first_order: - sigma = self.sigmas[step_index] - sigma_interpol = self.sigmas_interpol[step_index + 1] - sigma_next = self.sigmas[step_index + 1] - else: - # 2nd order / KDPM2's method - sigma = self.sigmas[step_index - 1] - sigma_interpol = self.sigmas_interpol[step_index] - sigma_next = self.sigmas[step_index] - - # currently only gamma=0 is supported. This usually works best anyways. - # We can support gamma in the future but then need to scale the timestep before - # passing it to the model which requires a change in API - gamma = 0 - sigma_hat = sigma * (gamma + 1) # Note: sigma_hat == sigma for now - - # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise - if self.config.prediction_type == "epsilon": - sigma_input = sigma_hat if self.state_in_first_order else sigma_interpol - pred_original_sample = sample - sigma_input * model_output - elif self.config.prediction_type == "v_prediction": - sigma_input = sigma_hat if self.state_in_first_order else sigma_interpol - pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + ( - sample / (sigma_input**2 + 1) - ) - elif self.config.prediction_type == "sample": - raise NotImplementedError("prediction_type not implemented yet: sample") - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" - ) - - if self.state_in_first_order: - # 2. Convert to an ODE derivative for 1st order - derivative = (sample - pred_original_sample) / sigma_hat - # 3. delta timestep - dt = sigma_interpol - sigma_hat - - # store for 2nd order step - self.sample = sample - else: - # DPM-Solver-2 - # 2. Convert to an ODE derivative for 2nd order - derivative = (sample - pred_original_sample) / sigma_interpol - - # 3. delta timestep - dt = sigma_next - sigma_hat - - sample = self.sample - self.sample = None - - prev_sample = sample + derivative * dt - - if not return_dict: - return (prev_sample,) - - return SchedulerOutput(prev_sample=prev_sample) - - def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, - ) -> paddle.Tensor: - # Make sure sigmas and timesteps have the same dtype as original_samples - sigmas = self.sigmas.cast(original_samples.dtype) - - schedule_timesteps = self.timesteps - step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] - - sigma = sigmas[step_indices].flatten() - while len(sigma.shape) < len(original_samples.shape): - sigma = sigma.unsqueeze(-1) - - noisy_samples = original_samples + noise * sigma - return noisy_samples - - def __len__(self): - return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_karras_ve.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_karras_ve.py deleted file mode 100644 index ea135e48d2b0..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_karras_ve.py +++ /dev/null @@ -1,233 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 NVIDIA and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from dataclasses import dataclass -from typing import Optional, Tuple, Union - -import numpy as np -import paddle - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, randn_tensor -from .scheduling_utils import SchedulerMixin - - -@dataclass -class KarrasVeOutput(BaseOutput): - """ - Output class for the scheduler's step function output. - - Args: - prev_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - derivative (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - Derivative of predicted original image sample (x_0). - pred_original_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - The predicted denoised sample (x_{0}) based on the model output from the current timestep. - `pred_original_sample` can be used to preview progress or for guidance. - """ - - prev_sample: paddle.Tensor - derivative: paddle.Tensor - pred_original_sample: Optional[paddle.Tensor] = None - - -class KarrasVeScheduler(SchedulerMixin, ConfigMixin): - """ - Stochastic sampling from Karras et al. [1] tailored to the Variance-Expanding (VE) models [2]. Use Algorithm 2 and - the VE column of Table 1 from [1] for reference. - - [1] Karras, Tero, et al. "Elucidating the Design Space of Diffusion-Based Generative Models." - https://arxiv.org/abs/2206.00364 [2] Song, Yang, et al. "Score-based generative modeling through stochastic - differential equations." https://arxiv.org/abs/2011.13456 - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - For more details on the parameters, see the original paper's Appendix E.: "Elucidating the Design Space of - Diffusion-Based Generative Models." https://arxiv.org/abs/2206.00364. The grid search values used to find the - optimal {s_noise, s_churn, s_min, s_max} for a specific model are described in Table 5 of the paper. - - Args: - sigma_min (`float`): minimum noise magnitude - sigma_max (`float`): maximum noise magnitude - s_noise (`float`): the amount of additional noise to counteract loss of detail during sampling. - A reasonable range is [1.000, 1.011]. - s_churn (`float`): the parameter controlling the overall amount of stochasticity. - A reasonable range is [0, 100]. - s_min (`float`): the start value of the sigma range where we add noise (enable stochasticity). - A reasonable range is [0, 10]. - s_max (`float`): the end value of the sigma range where we add noise. - A reasonable range is [0.2, 80]. 
- - """ - - order = 2 - - @register_to_config - def __init__( - self, - sigma_min: float = 0.02, - sigma_max: float = 100, - s_noise: float = 1.007, - s_churn: float = 80, - s_min: float = 0.05, - s_max: float = 50, - ): - # standard deviation of the initial noise distribution - self.init_noise_sigma = sigma_max - - # setable values - self.num_inference_steps: int = None - self.timesteps: paddle.Tensor = None - self.schedule: paddle.Tensor = None # sigma(t_i) - - def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. - - Args: - sample (`paddle.Tensor`): input sample - timestep (`int`, optional): current timestep - - Returns: - `paddle.Tensor`: scaled input sample - """ - return sample - - def set_timesteps(self, num_inference_steps: int): - """ - Sets the continuous timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. - - """ - self.num_inference_steps = num_inference_steps - timesteps = np.arange(0, self.num_inference_steps)[::-1].copy() - self.timesteps = paddle.to_tensor(timesteps) - schedule = [ - ( - self.config.sigma_max**2 - * (self.config.sigma_min**2 / self.config.sigma_max**2) ** (i / (num_inference_steps - 1)) - ) - for i in self.timesteps - ] - self.schedule = paddle.to_tensor(schedule, dtype=paddle.float32) - - def add_noise_to_input( - self, sample: paddle.Tensor, sigma: float, generator: Optional[paddle.Generator] = None - ) -> Tuple[paddle.Tensor, float]: - """ - Explicit Langevin-like "churn" step of adding noise to the sample according to a factor gamma_i ≥ 0 to reach a - higher noise level sigma_hat = sigma_i + gamma_i*sigma_i. - - TODO Args: - """ - if self.config.s_min <= sigma <= self.config.s_max: - gamma = min(self.config.s_churn / self.num_inference_steps, 2**0.5 - 1) - else: - gamma = 0 - - # sample eps ~ N(0, S_noise^2 * I) - eps = self.config.s_noise * randn_tensor(sample.shape, generator=generator) - sigma_hat = sigma + gamma * sigma - sample_hat = sample + ((sigma_hat**2 - sigma**2) ** 0.5 * eps) - - return sample_hat, sigma_hat - - def step( - self, - model_output: paddle.Tensor, - sigma_hat: float, - sigma_prev: float, - sample_hat: paddle.Tensor, - return_dict: bool = True, - ) -> Union[KarrasVeOutput, Tuple]: - """ - Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - sigma_hat (`float`): TODO - sigma_prev (`float`): TODO - sample_hat (`paddle.Tensor`): TODO - return_dict (`bool`): option for returning tuple rather than KarrasVeOutput class - - KarrasVeOutput: updated sample in the diffusion chain and derivative (TODO double check). - Returns: - [`~schedulers.scheduling_karras_ve.KarrasVeOutput`] or `tuple`: - [`~schedulers.scheduling_karras_ve.KarrasVeOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. 
- - """ - - pred_original_sample = sample_hat + sigma_hat * model_output - derivative = (sample_hat - pred_original_sample) / sigma_hat - sample_prev = sample_hat + (sigma_prev - sigma_hat) * derivative - - if not return_dict: - return (sample_prev, derivative) - - return KarrasVeOutput( - prev_sample=sample_prev, derivative=derivative, pred_original_sample=pred_original_sample - ) - - def step_correct( - self, - model_output: paddle.Tensor, - sigma_hat: float, - sigma_prev: float, - sample_hat: paddle.Tensor, - sample_prev: paddle.Tensor, - derivative: paddle.Tensor, - return_dict: bool = True, - ) -> Union[KarrasVeOutput, Tuple]: - """ - Correct the predicted sample based on the output model_output of the network. TODO complete description - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - sigma_hat (`float`): TODO - sigma_prev (`float`): TODO - sample_hat (`paddle.Tensor`): TODO - sample_prev (`paddle.Tensor`): TODO - derivative (`paddle.Tensor`): TODO - return_dict (`bool`): option for returning tuple rather than KarrasVeOutput class - - Returns: - prev_sample (TODO): updated sample in the diffusion chain. derivative (TODO): TODO - - """ - pred_original_sample = sample_prev + sigma_prev * model_output - derivative_corr = (sample_prev - pred_original_sample) / sigma_prev - sample_prev = sample_hat + (sigma_prev - sigma_hat) * (0.5 * derivative + 0.5 * derivative_corr) - - if not return_dict: - return (sample_prev, derivative) - - return KarrasVeOutput( - prev_sample=sample_prev, derivative=derivative, pred_original_sample=pred_original_sample - ) - - def add_noise(self, original_samples, noise, timesteps): - raise NotImplementedError() diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_lms_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_lms_discrete.py deleted file mode 100644 index fdf4dc9794db..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_lms_discrete.py +++ /dev/null @@ -1,295 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 Katherine Crowson and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math -import warnings -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle -from scipy import integrate - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput -from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin - - -@dataclass -# Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->LMSDiscrete -class LMSDiscreteSchedulerOutput(BaseOutput): - """ - Output class for the scheduler's step function output. - - Args: - prev_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. 
- pred_original_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - The predicted denoised sample (x_{0}) based on the model output from the current timestep. - `pred_original_sample` can be used to preview progress or for guidance. - """ - - prev_sample: paddle.Tensor - pred_original_sample: Optional[paddle.Tensor] = None - - -# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. - - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. - - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ - - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 - - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return paddle.to_tensor(betas, dtype=paddle.float32) - - -class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin): - """ - Linear Multistep Scheduler for discrete beta schedules. Based on the original k-diffusion implementation by - Katherine Crowson: - https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181 - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. - beta_start (`float`): the starting `beta` value of inference. - beta_end (`float`): the final `beta` value. - beta_schedule (`str`): - the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear` or `scaled_linear`. - trained_betas (`np.ndarray`, optional): - option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. 
- prediction_type (`str`, default `epsilon`, optional): - prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion - process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 - https://imagen.research.google/video/paper.pdf) - """ - - _compatibles = [e.name for e in KarrasDiffusionSchedulers] - order = 1 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.0001, - beta_end: float = 0.02, - beta_schedule: str = "linear", - trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - prediction_type: str = "epsilon", - ): - if trained_betas is not None: - self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) - elif beta_schedule == "linear": - self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) - elif beta_schedule == "scaled_linear": - # this schedule is very specific to the latent diffusion model. - self.betas = ( - paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2 - ) - elif beta_schedule == "squaredcos_cap_v2": - # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) - else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) - sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) - self.sigmas = paddle.to_tensor(sigmas) - - # standard deviation of the initial noise distribution - self.init_noise_sigma = self.sigmas.max() - - # setable values - self.num_inference_steps = None - timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() - self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) - self.derivatives = [] - self.is_scale_input_called = False - - def scale_model_input(self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor]) -> paddle.Tensor: - """ - Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the K-LMS algorithm. - - Args: - sample (`paddle.Tensor`): input sample - timestep (`float` or `paddle.Tensor`): the current timestep in the diffusion chain - - Returns: - `paddle.Tensor`: scaled input sample - """ - step_index = (self.timesteps == timestep).nonzero().item() - sigma = self.sigmas[step_index] - sample = sample / ((sigma**2 + 1) ** 0.5) - self.is_scale_input_called = True - return sample - - def get_lms_coefficient(self, order, t, current_order): - """ - Compute a linear multistep coefficient. - - Args: - order (TODO): - t (TODO): - current_order (TODO): - """ - - def lms_derivative(tau): - prod = 1.0 - for k in range(order): - if current_order == k: - continue - prod *= (tau - self.sigmas[t - k]) / (self.sigmas[t - current_order] - self.sigmas[t - k]) - return prod - - integrated_coeff = integrate.quad(lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0] - - return integrated_coeff - - def set_timesteps(self, num_inference_steps: int): - """ - Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. 
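get_lms_coefficient above integrates a Lagrange basis polynomial over the interval between two adjacent sigmas to obtain each linear multistep weight. The sketch below restates that computation with toy sigmas; lms_coefficient is a hypothetical standalone wrapper around the same scipy.integrate.quad call.

# Illustrative only: the coefficients applied to the stored derivatives, newest first.
import numpy as np
from scipy import integrate

sigmas = np.array([14.6, 9.7, 6.1, 3.8, 2.2])  # descending toy schedule
t, order = 3, 3  # coefficients for the step from sigmas[3] to sigmas[4]

def lms_coefficient(order, t, current_order):
    def lms_derivative(tau):
        prod = 1.0
        for k in range(order):
            if current_order == k:
                continue
            prod *= (tau - sigmas[t - k]) / (sigmas[t - current_order] - sigmas[t - k])
        return prod
    return integrate.quad(lms_derivative, sigmas[t], sigmas[t + 1], epsrel=1e-4)[0]

print([lms_coefficient(order, t, c) for c in range(order)])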
- """ - self.num_inference_steps = num_inference_steps - - timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) - sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) - sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) - self.sigmas = paddle.to_tensor(sigmas) - self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) - - self.derivatives = [] - - def step( - self, - model_output: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - sample: paddle.Tensor, - order: int = 4, - return_dict: bool = True, - ) -> Union[LMSDiscreteSchedulerOutput, Tuple]: - """ - Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`float`): current timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - order: coefficient for multi-step inference. - return_dict (`bool`): option for returning tuple rather than LMSDiscreteSchedulerOutput class - - Returns: - [`~schedulers.scheduling_utils.LMSDiscreteSchedulerOutput`] or `tuple`: - [`~schedulers.scheduling_utils.LMSDiscreteSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. - When returning a tuple, the first element is the sample tensor. - - """ - if not self.is_scale_input_called: - warnings.warn( - "The `scale_model_input` function should be called before `step` to ensure correct denoising. " - "See `StableDiffusionPipeline` for a usage example." - ) - - step_index = (self.timesteps == timestep).nonzero().item() - sigma = self.sigmas[step_index] - - # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise - if self.config.prediction_type == "epsilon": - pred_original_sample = sample - sigma * model_output - elif self.config.prediction_type == "v_prediction": - # * c_out + input * c_skip - pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) - elif self.config.prediction_type == "sample": - pred_original_sample = model_output - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" - ) - - # 2. Convert to an ODE derivative - derivative = (sample - pred_original_sample) / sigma - self.derivatives.append(derivative) - if len(self.derivatives) > order: - self.derivatives.pop(0) - - # 3. Compute linear multistep coefficients - order = min(step_index + 1, order) - lms_coeffs = [self.get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)] - - # 4. 
Compute previous sample based on the derivatives path - prev_sample = sample + sum( - coeff * derivative for coeff, derivative in zip(lms_coeffs, reversed(self.derivatives)) - ) - - if not return_dict: - return (prev_sample,) - - return LMSDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) - - def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, - ) -> paddle.Tensor: - # Make sure sigmas and timesteps have the same dtype as original_samples - sigmas = self.sigmas.cast(original_samples.dtype) - schedule_timesteps = self.timesteps - - step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] - - sigma = sigmas[step_indices].flatten() - while len(sigma.shape) < len(original_samples.shape): - sigma = sigma.unsqueeze(-1) - - noisy_samples = original_samples + noise * sigma - return noisy_samples - - def __len__(self): - return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py deleted file mode 100644 index 9ea8851ca729..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py +++ /dev/null @@ -1,426 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 Zhejiang University Team and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim - -import math -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle - -from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput - - -# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. - - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. 
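As a quick numeric illustration of the cosine alpha_bar schedule this helper implements: betas start near zero and are capped by max_beta at the very end of the diffusion. The printed values below are approximate and use the same constants as the function.

# Illustrative check of the "squaredcos_cap_v2" schedule, plain Python only.
import math

def alpha_bar(t):
    return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

def beta(i, n=1000, max_beta=0.999):
    return min(1 - alpha_bar((i + 1) / n) / alpha_bar(i / n), max_beta)

print(beta(0), beta(500), beta(999))  # roughly 4e-05, 3e-03, and 0.999 (capped)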
- - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ - - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 - - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return paddle.to_tensor(betas, dtype=paddle.float32) - - -class PNDMScheduler(SchedulerMixin, ConfigMixin): - """ - Pseudo numerical methods for diffusion models (PNDM) proposes using more advanced ODE integration techniques, - namely Runge-Kutta method and a linear multi-step method. - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - For more details, see the original paper: https://arxiv.org/abs/2202.09778 - - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. - beta_start (`float`): the starting `beta` value of inference. - beta_end (`float`): the final `beta` value. - beta_schedule (`str`): - the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear`, `scaled_linear`, or `squaredcos_cap_v2`. - trained_betas (`np.ndarray`, optional): - option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. - skip_prk_steps (`bool`): - allows the scheduler to skip the Runge-Kutta steps that are defined in the original paper as being required - before plms steps; defaults to `False`. - set_alpha_to_one (`bool`, default `False`): - each diffusion step uses the value of alphas product at that step and at the previous one. For the final - step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`, - otherwise it uses the value of alpha at step 0. - prediction_type (`str`, default `epsilon`, optional): - prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process) - or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf) - steps_offset (`int`, default `0`): - an offset added to the inference steps. You can use a combination of `offset=1` and - `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in - stable diffusion. - - """ - - _compatibles = [e.name for e in KarrasDiffusionSchedulers] - order = 1 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.0001, - beta_end: float = 0.02, - beta_schedule: str = "linear", - trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - skip_prk_steps: bool = False, - set_alpha_to_one: bool = False, - prediction_type: str = "epsilon", - steps_offset: int = 0, - ): - if trained_betas is not None: - self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) - elif beta_schedule == "linear": - self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) - elif beta_schedule == "scaled_linear": - # this schedule is very specific to the latent diffusion model. 
- self.betas = ( - paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2 - ) - elif beta_schedule == "squaredcos_cap_v2": - # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) - else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - - self.final_alpha_cumprod = paddle.to_tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] - - # standard deviation of the initial noise distribution - self.init_noise_sigma = 1.0 - - # For now we only support F-PNDM, i.e. the runge-kutta method - # For more information on the algorithm please take a look at the paper: https://arxiv.org/pdf/2202.09778.pdf - # mainly at formula (9), (12), (13) and the Algorithm 2. - self.pndm_order = 4 - - # running values - self.cur_model_output = 0 - self.counter = 0 - self.cur_sample = None - self.ets = [] - - # setable values - self.num_inference_steps = None - self._timesteps = np.arange(0, num_train_timesteps)[::-1].copy() - self.prk_timesteps = None - self.plms_timesteps = None - self.timesteps = None - - def set_timesteps(self, num_inference_steps: int): - """ - Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. - """ - - self.num_inference_steps = num_inference_steps - step_ratio = self.config.num_train_timesteps // self.num_inference_steps - # creates integer timesteps by multiplying by ratio - # casting to int to avoid issues when num_inference_step is power of 3 - self._timesteps = (np.arange(0, num_inference_steps) * step_ratio).round() - self._timesteps += self.config.steps_offset - - if self.config.skip_prk_steps: - # for some models like stable diffusion the prk steps can/should be skipped to - # produce better results. When using PNDM with `self.config.skip_prk_steps` the implementation - # is based on crowsonkb's PLMS sampler implementation: https://github.com/CompVis/latent-diffusion/pull/51 - self.prk_timesteps = np.array([]) - self.plms_timesteps = np.concatenate([self._timesteps[:-1], self._timesteps[-2:-1], self._timesteps[-1:]])[ - ::-1 - ].copy() - else: - prk_timesteps = np.array(self._timesteps[-self.pndm_order :]).repeat(2) + np.tile( - np.array([0, self.config.num_train_timesteps // num_inference_steps // 2]), self.pndm_order - ) - self.prk_timesteps = (prk_timesteps[:-1].repeat(2)[1:-1])[::-1].copy() - self.plms_timesteps = self._timesteps[:-3][ - ::-1 - ].copy() # we copy to avoid having negative strides which are not supported by paddle - - timesteps = np.concatenate([self.prk_timesteps, self.plms_timesteps]).astype(np.int64) - self.timesteps = paddle.to_tensor(timesteps) - - self.ets = [] - self.counter = 0 - self.cur_model_output = 0 - - def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool = True, - ) -> Union[SchedulerOutput, Tuple]: - """ - Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion - process from the learned model outputs (most often the predicted noise). - - This function calls `step_prk()` or `step_plms()` depending on the internal variable `counter`. - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. 
- timestep (`int`): current discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - return_dict (`bool`): option for returning tuple rather than SchedulerOutput class - - Returns: - [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: - [`~schedulers.scheduling_utils.SchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. - - """ - if self.counter < len(self.prk_timesteps) and not self.config.skip_prk_steps: - return self.step_prk(model_output=model_output, timestep=timestep, sample=sample, return_dict=return_dict) - else: - return self.step_plms(model_output=model_output, timestep=timestep, sample=sample, return_dict=return_dict) - - def step_prk( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool = True, - ) -> Union[SchedulerOutput, Tuple]: - """ - Step function propagating the sample with the Runge-Kutta method. RK takes 4 forward passes to approximate the - solution to the differential equation. - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - return_dict (`bool`): option for returning tuple rather than SchedulerOutput class - - Returns: - [`~scheduling_utils.SchedulerOutput`] or `tuple`: [`~scheduling_utils.SchedulerOutput`] if `return_dict` is - True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. - - """ - if self.num_inference_steps is None: - raise ValueError( - "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" - ) - - diff_to_prev = 0 if self.counter % 2 else self.config.num_train_timesteps // self.num_inference_steps // 2 - prev_timestep = timestep - diff_to_prev - timestep = self.prk_timesteps[self.counter // 4 * 4] - - if self.counter % 4 == 0: - self.cur_model_output += 1 / 6 * model_output - self.ets.append(model_output) - self.cur_sample = sample - elif (self.counter - 1) % 4 == 0: - self.cur_model_output += 1 / 3 * model_output - elif (self.counter - 2) % 4 == 0: - self.cur_model_output += 1 / 3 * model_output - elif (self.counter - 3) % 4 == 0: - model_output = self.cur_model_output + 1 / 6 * model_output - self.cur_model_output = 0 - - # cur_sample should not be `None` - cur_sample = self.cur_sample if self.cur_sample is not None else sample - - prev_sample = self._get_prev_sample(cur_sample, timestep, prev_timestep, model_output) - self.counter += 1 - - if not return_dict: - return (prev_sample,) - - return SchedulerOutput(prev_sample=prev_sample) - - def step_plms( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool = True, - ) -> Union[SchedulerOutput, Tuple]: - """ - Step function propagating the sample with the linear multi-step method. This has one forward pass with multiple - times to approximate the solution. - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. 
- return_dict (`bool`): option for returning tuple rather than SchedulerOutput class - - Returns: - [`~scheduling_utils.SchedulerOutput`] or `tuple`: [`~scheduling_utils.SchedulerOutput`] if `return_dict` is - True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. - - """ - if self.num_inference_steps is None: - raise ValueError( - "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" - ) - - if not self.config.skip_prk_steps and len(self.ets) < 3: - raise ValueError( - f"{self.__class__} can only be run AFTER scheduler has been run " - "in 'prk' mode for at least 12 iterations " - "See: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_pndm.py " - "for more information." - ) - - prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps - - if self.counter != 1: - self.ets = self.ets[-3:] - self.ets.append(model_output) - else: - prev_timestep = timestep - timestep = timestep + self.config.num_train_timesteps // self.num_inference_steps - - if len(self.ets) == 1 and self.counter == 0: - model_output = model_output - self.cur_sample = sample - elif len(self.ets) == 1 and self.counter == 1: - model_output = (model_output + self.ets[-1]) / 2 - sample = self.cur_sample - self.cur_sample = None - elif len(self.ets) == 2: - model_output = (3 * self.ets[-1] - self.ets[-2]) / 2 - elif len(self.ets) == 3: - model_output = (23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12 - else: - model_output = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + 37 * self.ets[-3] - 9 * self.ets[-4]) - - prev_sample = self._get_prev_sample(sample, timestep, prev_timestep, model_output) - self.counter += 1 - - if not return_dict: - return (prev_sample,) - - return SchedulerOutput(prev_sample=prev_sample) - - def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. 
-
-        Args:
-            sample (`paddle.Tensor`): input sample
-
-        Returns:
-            `paddle.Tensor`: scaled input sample
-        """
-        return sample
-
-    def _get_prev_sample(self, sample, timestep, prev_timestep, model_output):
-        # See formula (9) of PNDM paper https://arxiv.org/pdf/2202.09778.pdf
-        # this function computes x_(t−δ) using the formula of (9)
-        # Note that x_t needs to be added to both sides of the equation
-
-        # Notation (<variable name> -> <name in paper>)
-        # alpha_prod_t -> α_t
-        # alpha_prod_t_prev -> α_(t−δ)
-        # beta_prod_t -> (1 - α_t)
-        # beta_prod_t_prev -> (1 - α_(t−δ))
-        # sample -> x_t
-        # model_output -> e_θ(x_t, t)
-        # prev_sample -> x_(t−δ)
-        alpha_prod_t = self.alphas_cumprod[timestep]
-        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
-        beta_prod_t = 1 - alpha_prod_t
-        beta_prod_t_prev = 1 - alpha_prod_t_prev
-
-        if self.config.prediction_type == "v_prediction":
-            model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
-        elif self.config.prediction_type != "epsilon":
-            raise ValueError(
-                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `v_prediction`"
-            )
-
-        # corresponds to (α_(t−δ) - α_t) divided by
-        # denominator of x_t in formula (9) and plus 1
-        # Note: (α_(t−δ) - α_t) / (sqrt(α_t) * (sqrt(α_(t−δ)) + sqrt(α_t))) =
-        # sqrt(α_(t−δ)) / sqrt(α_t)
-        sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5)
-
-        # corresponds to denominator of e_θ(x_t, t) in formula (9)
-        model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + (
-            alpha_prod_t * beta_prod_t * alpha_prod_t_prev
-        ) ** (0.5)
-
-        # full formula (9)
-        prev_sample = (
-            sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * model_output / model_output_denom_coeff
-        )
-
-        return prev_sample
-
-    # Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
-    def add_noise(
-        self,
-        original_samples: paddle.Tensor,
-        noise: paddle.Tensor,
-        timesteps: paddle.Tensor,
-    ) -> paddle.Tensor:
-        # Make sure alphas_cumprod and timestep have same dtype as original_samples
-        alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype)
-
-        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
-        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
-        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
-            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
-
-        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
-        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
-        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
-            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
-
-        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
-        return noisy_samples
-
-    def __len__(self):
-        return self.config.num_train_timesteps
diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_repaint.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_repaint.py
deleted file mode 100644
index 5a506ab29142..000000000000
--- a/ppdiffusers/ppdiffusers/schedulers/scheduling_repaint.py
+++ /dev/null
@@ -1,324 +0,0 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-# Copyright 2022 ETH Zurich Computer Vision Lab and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle -import paddle.nn.functional as F - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, randn_tensor -from .scheduling_utils import SchedulerMixin - - -@dataclass -class RePaintSchedulerOutput(BaseOutput): - """ - Output class for the scheduler's step function output. - - Args: - prev_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - pred_original_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - The predicted denoised sample (x_{0}) based on the model output from - the current timestep. `pred_original_sample` can be used to preview progress or for guidance. - """ - - prev_sample: paddle.Tensor - pred_original_sample: paddle.Tensor - - -# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. - - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. - - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ - - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 - - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return paddle.to_tensor(betas, dtype=paddle.float32) - - -class RePaintScheduler(SchedulerMixin, ConfigMixin): - """ - RePaint is a schedule for DDPM inpainting inside a given mask. - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - For more details, see the original paper: https://arxiv.org/pdf/2201.09865.pdf - - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. - beta_start (`float`): the starting `beta` value of inference. - beta_end (`float`): the final `beta` value. - beta_schedule (`str`): - the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear`, `scaled_linear`, or `squaredcos_cap_v2`. 
- eta (`float`): - The weight of noise for added noise in a diffusion step. Its value is between 0.0 and 1.0 -0.0 is DDIM and - 1.0 is DDPM scheduler respectively. - trained_betas (`np.ndarray`, optional): - option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. - variance_type (`str`): - options to clip the variance used when adding noise to the denoised sample. Choose from `fixed_small`, - `fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`. - clip_sample (`bool`, default `True`): - option to clip predicted sample between -1 and 1 for numerical stability. - - """ - - order = 1 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.0001, - beta_end: float = 0.02, - beta_schedule: str = "linear", - eta: float = 0.0, - trained_betas: Optional[np.ndarray] = None, - clip_sample: bool = True, - ): - if trained_betas is not None: - self.betas = paddle.to_tensor(trained_betas) - elif beta_schedule == "linear": - self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) - elif beta_schedule == "scaled_linear": - # this schedule is very specific to the latent diffusion model. - self.betas = ( - paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2 - ) - elif beta_schedule == "squaredcos_cap_v2": - # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) - elif beta_schedule == "sigmoid": - # GeoDiff sigmoid schedule - betas = paddle.linspace(-6, 6, num_train_timesteps) - self.betas = F.sigmoid(betas) * (beta_end - beta_start) + beta_start - else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - self.one = paddle.to_tensor(1.0) - - self.final_alpha_cumprod = paddle.to_tensor(1.0) - - # standard deviation of the initial noise distribution - self.init_noise_sigma = 1.0 - - # setable values - self.num_inference_steps = None - self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy()) - - self.eta = eta - - def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. 
- - Args: - sample (`paddle.Tensor`): input sample - timestep (`int`, optional): current timestep - - Returns: - `paddle.Tensor`: scaled input sample - """ - return sample - - def set_timesteps( - self, - num_inference_steps: int, - jump_length: int = 10, - jump_n_sample: int = 10, - ): - num_inference_steps = min(self.config.num_train_timesteps, num_inference_steps) - self.num_inference_steps = num_inference_steps - - timesteps = [] - - jumps = {} - for j in range(0, num_inference_steps - jump_length, jump_length): - jumps[j] = jump_n_sample - 1 - - t = num_inference_steps - while t >= 1: - t = t - 1 - timesteps.append(t) - - if jumps.get(t, 0) > 0: - jumps[t] = jumps[t] - 1 - for _ in range(jump_length): - t = t + 1 - timesteps.append(t) - - timesteps = np.array(timesteps) * (self.config.num_train_timesteps // self.num_inference_steps) - self.timesteps = paddle.to_tensor(timesteps) - - def _get_variance(self, t): - prev_timestep = t - self.config.num_train_timesteps // self.num_inference_steps - - alpha_prod_t = self.alphas_cumprod[t] - alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod - beta_prod_t = 1 - alpha_prod_t - beta_prod_t_prev = 1 - alpha_prod_t_prev - - # For t > 0, compute predicted variance βt (see formula (6) and (7) from - # https://arxiv.org/pdf/2006.11239.pdf) and sample from it to get - # previous sample x_{t-1} ~ N(pred_prev_sample, variance) == add - # variance to pred_sample - # Is equivalent to formula (16) in https://arxiv.org/pdf/2010.02502.pdf - # without eta. - # variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[t] - variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) - - return variance - - def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - original_image: paddle.Tensor, - mask: paddle.Tensor, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - return_dict: bool = True, - ) -> Union[RePaintSchedulerOutput, Tuple]: - """ - Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`paddle.Tensor`): direct output from learned - diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - original_image (`paddle.Tensor`): - the original image to inpaint on. - mask (`paddle.Tensor`): - the mask where 0.0 values define which part of the original image to inpaint (change). - generator (`paddle.Generator`, *optional*): random number generator. - return_dict (`bool`): option for returning tuple rather than - DDPMSchedulerOutput class - - Returns: - [`~schedulers.scheduling_utils.RePaintSchedulerOutput`] or `tuple`: - [`~schedulers.scheduling_utils.RePaintSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. - - """ - t = timestep - prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps - - # 1. compute alphas, betas - alpha_prod_t = self.alphas_cumprod[t] - alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod - beta_prod_t = 1 - alpha_prod_t - - # 2. 
compute predicted original sample from predicted noise also called - # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf - pred_original_sample = (sample - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5 - - # 3. Clip "predicted x_0" - if self.config.clip_sample: - pred_original_sample = paddle.clip(pred_original_sample, -1, 1) - - # We choose to follow RePaint Algorithm 1 to get x_{t-1}, however we - # substitute formula (7) in the algorithm coming from DDPM paper - # (formula (4) Algorithm 2 - Sampling) with formula (12) from DDIM paper. - # DDIM schedule gives the same results as DDPM with eta = 1.0 - # Noise is being reused in 7. and 8., but no impact on quality has - # been observed. - - # 5. Add noise - noise = randn_tensor(model_output.shape, generator=generator, dtype=model_output.dtype) - std_dev_t = self.eta * self._get_variance(timestep) ** 0.5 - - variance = 0 - if t > 0 and self.eta > 0: - variance = std_dev_t * noise - - # 6. compute "direction pointing to x_t" of formula (12) - # from https://arxiv.org/pdf/2010.02502.pdf - pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** 0.5 * model_output - - # 7. compute x_{t-1} of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - prev_unknown_part = alpha_prod_t_prev**0.5 * pred_original_sample + pred_sample_direction + variance - - # 8. Algorithm 1 Line 5 https://arxiv.org/pdf/2201.09865.pdf - prev_known_part = (alpha_prod_t_prev**0.5) * original_image + ((1 - alpha_prod_t_prev) ** 0.5) * noise - - # 9. Algorithm 1 Line 8 https://arxiv.org/pdf/2201.09865.pdf - pred_prev_sample = mask * prev_known_part + (1.0 - mask) * prev_unknown_part - - if not return_dict: - return ( - pred_prev_sample, - pred_original_sample, - ) - - return RePaintSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample) - - def undo_step(self, sample, timestep, generator=None): - n = self.config.num_train_timesteps // self.num_inference_steps - - for i in range(n): - beta = self.betas[timestep + i] - noise = randn_tensor(sample.shape, generator=generator, dtype=sample.dtype) - - # 10. Algorithm 1 Line 10 https://arxiv.org/pdf/2201.09865.pdf - sample = (1 - beta) ** 0.5 * sample + beta**0.5 * noise - - return sample - - def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, - ) -> paddle.Tensor: - raise NotImplementedError("Use `DDPMScheduler.add_noise()` to train for sampling with RePaint.") - - def __len__(self): - return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_ve.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_ve.py deleted file mode 100644 index 9c6e131cfe94..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_ve.py +++ /dev/null @@ -1,277 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 Google Brain and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
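For orientation, the `RePaintScheduler` removed above is meant to be driven by RePaint's resampling loop: walk the jump schedule produced by `set_timesteps`, denoise with `step()` while the timestep decreases, and re-noise with `undo_step()` whenever the schedule jumps back up. A minimal sketch under that assumption (not part of the deleted file; `unet`, `original_image` and `mask` are stand-ins, and the top-level import is assumed):

```python
import paddle

from ppdiffusers import RePaintScheduler  # assumed top-level export

scheduler = RePaintScheduler()
scheduler.set_timesteps(num_inference_steps=250, jump_length=10, jump_n_sample=10)

original_image = paddle.zeros([1, 3, 64, 64])  # known image to inpaint
mask = paddle.ones([1, 3, 64, 64])             # 1.0 = keep pixel, 0.0 = inpaint it
sample = paddle.randn([1, 3, 64, 64])          # start from pure noise


def unet(x, t):
    # stand-in for a trained noise-prediction UNet
    return paddle.zeros_like(x)


t_last = scheduler.timesteps[0] + 1
for t in scheduler.timesteps:
    if t < t_last:
        # forward (denoising) step: step() merges the known and unknown
        # regions via the mask, as in Algorithm 1 of the RePaint paper
        model_output = unet(sample, t)
        sample = scheduler.step(model_output, t, sample, original_image, mask).prev_sample
    else:
        # backward jump: undo_step() re-noises the sample (Algorithm 1, line 10)
        sample = scheduler.undo_step(sample, t_last)
    t_last = t
```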
- -# DISCLAIMER: This file is strongly influenced by https://github.com/yang-song/score_sde_pytorch - -import math -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import paddle - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, randn_tensor -from .scheduling_utils import SchedulerMixin, SchedulerOutput - - -@dataclass -class SdeVeOutput(BaseOutput): - """ - Output class for the ScoreSdeVeScheduler's step function output. - - Args: - prev_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - prev_sample_mean (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - Mean averaged `prev_sample`. Same as `prev_sample`, only mean-averaged over previous timesteps. - """ - - prev_sample: paddle.Tensor - prev_sample_mean: paddle.Tensor - - -class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin): - """ - The variance exploding stochastic differential equation (SDE) scheduler. - - For more information, see the original paper: https://arxiv.org/abs/2011.13456 - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. - snr (`float`): - coefficient weighting the step from the model_output sample (from the network) to the random noise. - sigma_min (`float`): - initial noise scale for sigma sequence in sampling procedure. The minimum sigma should mirror the - distribution of the data. - sigma_max (`float`): maximum value used for the range of continuous timesteps passed into the model. - sampling_eps (`float`): the end value of sampling, where timesteps decrease progressively from 1 to - epsilon. - correct_steps (`int`): number of correction steps performed on a produced sample. - """ - - order = 1 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 2000, - snr: float = 0.15, - sigma_min: float = 0.01, - sigma_max: float = 1348.0, - sampling_eps: float = 1e-5, - correct_steps: int = 1, - ): - # standard deviation of the initial noise distribution - self.init_noise_sigma = sigma_max - - # setable values - self.timesteps = None - - self.set_sigmas(num_train_timesteps, sigma_min, sigma_max, sampling_eps) - - def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. - - Args: - sample (`paddle.Tensor`): input sample - timestep (`int`, optional): current timestep - - Returns: - `paddle.Tensor`: scaled input sample - """ - return sample - - def set_timesteps(self, num_inference_steps: int, sampling_eps: float = None): - """ - Sets the continuous timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. 
- sampling_eps (`float`, optional): - final timestep value (overrides value given at Scheduler instantiation). - - """ - sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps - - self.timesteps = paddle.linspace(1, sampling_eps, num_inference_steps) - - def set_sigmas( - self, num_inference_steps: int, sigma_min: float = None, sigma_max: float = None, sampling_eps: float = None - ): - """ - Sets the noise scales used for the diffusion chain. Supporting function to be run before inference. - - The sigmas control the weight of the `drift` and `diffusion` components of sample update. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. - sigma_min (`float`, optional): - initial noise scale value (overrides value given at Scheduler instantiation). - sigma_max (`float`, optional): - final noise scale value (overrides value given at Scheduler instantiation). - sampling_eps (`float`, optional): - final timestep value (overrides value given at Scheduler instantiation). - - """ - sigma_min = sigma_min if sigma_min is not None else self.config.sigma_min - sigma_max = sigma_max if sigma_max is not None else self.config.sigma_max - sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps - if self.timesteps is None: - self.set_timesteps(num_inference_steps, sampling_eps) - - self.sigmas = sigma_min * (sigma_max / sigma_min) ** (self.timesteps / sampling_eps) - self.discrete_sigmas = paddle.exp( - paddle.linspace(math.log(sigma_min), math.log(sigma_max), num_inference_steps) - ) - self.sigmas = paddle.to_tensor([sigma_min * (sigma_max / sigma_min) ** t for t in self.timesteps]) - - def get_adjacent_sigma(self, timesteps, t): - return paddle.where( - timesteps == 0, - paddle.zeros_like(t), - self.discrete_sigmas[timesteps - 1], - ) - - def step_pred( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - return_dict: bool = True, - ) -> Union[SdeVeOutput, Tuple]: - """ - Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - generator: random number generator. - return_dict (`bool`): option for returning tuple rather than SchedulerOutput class - - Returns: - [`~schedulers.scheduling_sde_ve.SdeVeOutput`] or `tuple`: [`~schedulers.scheduling_sde_ve.SdeVeOutput`] if - `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. 
- - """ - if self.timesteps is None: - raise ValueError( - "`self.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler" - ) - - timestep = timestep * paddle.ones((sample.shape[0],)) # paddle.repeat_interleave(timestep, sample.shape[0]) - timesteps = (timestep * (len(self.timesteps) - 1)).cast("int64") - - sigma = self.discrete_sigmas[timesteps] - adjacent_sigma = self.get_adjacent_sigma(timesteps, timestep) - drift = paddle.zeros_like(sample) - diffusion = (sigma**2 - adjacent_sigma**2) ** 0.5 - - # equation 6 in the paper: the model_output modeled by the network is grad_x log pt(x) - # also equation 47 shows the analog from SDE models to ancestral sampling methods - diffusion = diffusion.flatten() - while len(diffusion.shape) < len(sample.shape): - diffusion = diffusion.unsqueeze(-1) - drift = drift - diffusion**2 * model_output - - # equation 6: sample noise for the diffusion term of - noise = randn_tensor(sample.shape, generator=generator, dtype=sample.dtype) - prev_sample_mean = sample - drift # subtract because `dt` is a small negative timestep - # TODO is the variable diffusion the correct scaling term for the noise? - prev_sample = prev_sample_mean + diffusion * noise # add impact of diffusion field g - - if not return_dict: - return (prev_sample, prev_sample_mean) - - return SdeVeOutput(prev_sample=prev_sample, prev_sample_mean=prev_sample_mean) - - def step_correct( - self, - model_output: paddle.Tensor, - sample: paddle.Tensor, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - return_dict: bool = True, - ) -> Union[SchedulerOutput, Tuple]: - """ - Correct the predicted sample based on the output model_output of the network. This is often run repeatedly - after making the prediction for the previous timestep. - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - generator: random number generator. - return_dict (`bool`): option for returning tuple rather than SchedulerOutput class - - Returns: - [`~schedulers.scheduling_sde_ve.SdeVeOutput`] or `tuple`: [`~schedulers.scheduling_sde_ve.SdeVeOutput`] if - `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. - - """ - if self.timesteps is None: - raise ValueError( - "`self.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler" - ) - - # For small batch sizes, the paper "suggest replacing norm(z) with sqrt(d), where d is the dim. 
of z" - # sample noise for correction - noise = randn_tensor(sample.shape, generator=generator) - - # compute step size from the model_output, the noise, and the snr - grad_norm = paddle.norm(model_output.reshape([model_output.shape[0], -1]), axis=-1).mean() - noise_norm = paddle.norm(noise.reshape([noise.shape[0], -1]), axis=-1).mean() - step_size = (self.config.snr * noise_norm / grad_norm) ** 2 * 2 - step_size = step_size * paddle.ones((sample.shape[0],)) - # self.repeat_scalar(step_size, sample.shape[0]) - - # compute corrected sample: model_output term and noise term - step_size = step_size.flatten() - while len(step_size.shape) < len(sample.shape): - step_size = step_size.unsqueeze(-1) - prev_sample_mean = sample + step_size * model_output - prev_sample = prev_sample_mean + ((step_size * 2) ** 0.5) * noise - - if not return_dict: - return (prev_sample,) - - return SchedulerOutput(prev_sample=prev_sample) - - def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, - ) -> paddle.Tensor: - # Make sure sigmas and timesteps have the same dtype as original_samples - sigmas = self.discrete_sigmas[timesteps] - noise = paddle.randn(original_samples.shape, dtype=original_samples.dtype) * sigmas[:, None, None, None] - noisy_samples = noise + original_samples - return noisy_samples - - def __len__(self): - return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_vp.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_vp.py deleted file mode 100644 index c0e1eebc3eb9..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_vp.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 Google Brain and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# DISCLAIMER: This file is strongly influenced by https://github.com/yang-song/score_sde_pytorch - -import math - -import paddle - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import randn_tensor -from .scheduling_utils import SchedulerMixin - - -class ScoreSdeVpScheduler(SchedulerMixin, ConfigMixin): - """ - The variance preserving stochastic differential equation (SDE) scheduler. - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. 
- - For more information, see the original paper: https://arxiv.org/abs/2011.13456 - - UNDER CONSTRUCTION - - """ - - order = 1 - - @register_to_config - def __init__(self, num_train_timesteps=2000, beta_min=0.1, beta_max=20, sampling_eps=1e-3): - self.sigmas = None - self.discrete_sigmas = None - self.timesteps = None - - def set_timesteps(self, num_inference_steps): - self.timesteps = paddle.linspace(1, self.config.sampling_eps, num_inference_steps) - - def step_pred(self, score, x, t, generator=None): - if self.timesteps is None: - raise ValueError( - "`self.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler" - ) - - # TODO(Patrick) better comments + non-Paddle - # postprocess model score - log_mean_coeff = ( - -0.25 * t**2 * (self.config.beta_max - self.config.beta_min) - 0.5 * t * self.config.beta_min - ) - std = paddle.sqrt(1.0 - paddle.exp(2.0 * log_mean_coeff)) - std = std.flatten() - while len(std.shape) < len(score.shape): - std = std.unsqueeze(-1) - score = -score / std - - # compute - dt = -1.0 / len(self.timesteps) - - beta_t = self.config.beta_min + t * (self.config.beta_max - self.config.beta_min) - beta_t = beta_t.flatten() - while len(beta_t.shape) < len(x.shape): - beta_t = beta_t.unsqueeze(-1) - drift = -0.5 * beta_t * x - - diffusion = paddle.sqrt(beta_t) - drift = drift - diffusion**2 * score - x_mean = x + drift * dt - - # add noise - noise = randn_tensor(x.shape, generator=generator, dtype=x.dtype) - x = x_mean + diffusion * math.sqrt(-dt) * noise - - return x, x_mean - - def __len__(self): - return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_unclip.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_unclip.py deleted file mode 100644 index 900e920ef473..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_unclip.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 Kakao Brain and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from dataclasses import dataclass -from typing import Optional, Tuple, Union - -import numpy as np -import paddle - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, randn_tensor -from .scheduling_utils import SchedulerMixin - - -@dataclass -# Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->UnCLIP -class UnCLIPSchedulerOutput(BaseOutput): - """ - Output class for the scheduler's step function output. - - Args: - prev_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - pred_original_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - The predicted denoised sample (x_{0}) based on the model output from the current timestep. 
- `pred_original_sample` can be used to preview progress or for guidance. - """ - - prev_sample: paddle.Tensor - pred_original_sample: Optional[paddle.Tensor] = None - - -# Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. - - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. - - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ - - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 - - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return paddle.to_tensor(betas, dtype=paddle.float32) - - -class UnCLIPScheduler(SchedulerMixin, ConfigMixin): - """ - This is a modified DDPM Scheduler specifically for the karlo unCLIP model. - - This scheduler has some minor variations in how it calculates the learned range variance and dynamically - re-calculates betas based off the timesteps it is skipping. - - The scheduler also uses a slightly different step ratio when computing timesteps to use for inference. - - See [`~DDPMScheduler`] for more information on DDPM scheduling - - Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. - variance_type (`str`): - options to clip the variance used when adding noise to the denoised sample. Choose from `fixed_small_log` - or `learned_range`. - clip_sample (`bool`, default `True`): - option to clip predicted sample between `-clip_sample_range` and `clip_sample_range` for numerical - stability. - clip_sample_range (`float`, default `1.0`): - The range to clip the sample between. See `clip_sample`. 
- prediction_type (`str`, default `epsilon`, optional): - prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process) - or `sample` (directly predicting the noisy sample`) - """ - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - variance_type: str = "fixed_small_log", - clip_sample: bool = True, - clip_sample_range: Optional[float] = 1.0, - prediction_type: str = "epsilon", - beta_schedule: str = "squaredcos_cap_v2", - ): - if beta_schedule != "squaredcos_cap_v2": - raise ValueError("UnCLIPScheduler only supports `beta_schedule`: 'squaredcos_cap_v2'") - - self.betas = betas_for_alpha_bar(num_train_timesteps) - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - self.one = paddle.to_tensor(1.0) - - # standard deviation of the initial noise distribution - self.init_noise_sigma = 1.0 - - # setable values - self.num_inference_steps = None - self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy()) - - self.variance_type = variance_type - - def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. - - Args: - sample (`paddle.Tensor`): input sample - timestep (`int`, optional): current timestep - - Returns: - `paddle.Tensor`: scaled input sample - """ - return sample - - def set_timesteps(self, num_inference_steps: int): - """ - Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. - - Note that this scheduler uses a slightly different step ratio than the other diffusers schedulers. The - different step ratio is to mimic the original karlo implementation and does not affect the quality or accuracy - of the results. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. 
- """ - self.num_inference_steps = num_inference_steps - step_ratio = (self.config.num_train_timesteps - 1) / (self.num_inference_steps - 1) - timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) - self.timesteps = paddle.to_tensor(timesteps) - - def _get_variance(self, t, prev_timestep=None, predicted_variance=None, variance_type=None): - if prev_timestep is None: - prev_timestep = t - 1 - - alpha_prod_t = self.alphas_cumprod[t] - alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.one - beta_prod_t = 1 - alpha_prod_t - beta_prod_t_prev = 1 - alpha_prod_t_prev - - if prev_timestep == t - 1: - beta = self.betas[t] - else: - beta = 1 - alpha_prod_t / alpha_prod_t_prev - - # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf) - # and sample from it to get previous sample - # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample - variance = beta_prod_t_prev / beta_prod_t * beta - - if variance_type is None: - variance_type = self.config.variance_type - - # hacks - were probably added for training stability - if variance_type == "fixed_small_log": - variance = paddle.log(paddle.clip(variance, min=1e-20)) - variance = paddle.exp(0.5 * variance) - elif variance_type == "learned_range": - # NOTE difference with DDPM scheduler - min_log = variance.log() - max_log = beta.log() - - frac = (predicted_variance + 1) / 2 - variance = frac * max_log + (1 - frac) * min_log - - return variance - - def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - prev_timestep: Optional[int] = None, - generator=None, - return_dict: bool = True, - ) -> Union[UnCLIPSchedulerOutput, Tuple]: - """ - Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - prev_timestep (`int`, *optional*): The previous timestep to predict the previous sample at. - Used to dynamically compute beta. If not given, `t-1` is used and the pre-computed beta is used. - generator: random number generator. - return_dict (`bool`): option for returning tuple rather than UnCLIPSchedulerOutput class - - Returns: - [`~schedulers.scheduling_utils.UnCLIPSchedulerOutput`] or `tuple`: - [`~schedulers.scheduling_utils.UnCLIPSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. - - """ - t = timestep - - if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type == "learned_range": - # must split like this, 3 -> split 2 -> [2, 1] - model_output, predicted_variance = model_output.split( - [sample.shape[1], model_output.shape[1] - sample.shape[1]], axis=1 - ) - else: - predicted_variance = None - - # 1. 
compute alphas, betas - if prev_timestep is None: - prev_timestep = t - 1 - - alpha_prod_t = self.alphas_cumprod[t] - alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.one - beta_prod_t = 1 - alpha_prod_t - beta_prod_t_prev = 1 - alpha_prod_t_prev - - if prev_timestep == t - 1: - beta = self.betas[t] - alpha = self.alphas[t] - else: - beta = 1 - alpha_prod_t / alpha_prod_t_prev - alpha = 1 - beta - - # 2. compute predicted original sample from predicted noise also called - # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf - if self.config.prediction_type == "epsilon": - pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) - elif self.config.prediction_type == "sample": - pred_original_sample = model_output - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `sample`" - " for the UnCLIPScheduler." - ) - - # 3. Clip "predicted x_0" - if self.config.clip_sample: - pred_original_sample = paddle.clip( - pred_original_sample, -self.config.clip_sample_range, self.config.clip_sample_range - ) - - # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t - # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf - pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * beta) / beta_prod_t - current_sample_coeff = alpha ** (0.5) * beta_prod_t_prev / beta_prod_t - - # 5. Compute predicted previous sample µ_t - # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf - pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample - - # 6. Add noise - variance = 0 - if t > 0: - variance_noise = randn_tensor( - model_output.shape, - dtype=model_output.dtype, - generator=generator, - ) - - variance = self._get_variance( - t, - predicted_variance=predicted_variance, - prev_timestep=prev_timestep, - ) - - if self.variance_type == "fixed_small_log": - variance = variance - elif self.variance_type == "learned_range": - variance = (0.5 * variance).exp() - else: - raise ValueError( - f"variance_type given as {self.variance_type} must be one of `fixed_small_log` or `learned_range`" - " for the UnCLIPScheduler." - ) - - variance = variance * variance_noise - - pred_prev_sample = pred_prev_sample + variance - - if not return_dict: - return (pred_prev_sample,) - - return UnCLIPSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample) diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_unipc_multistep.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_unipc_multistep.py deleted file mode 100644 index b4c0dda357ea..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_unipc_multistep.py +++ /dev/null @@ -1,640 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 TSAIL Team and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
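For orientation, a minimal sketch of how the `UnCLIPScheduler` removed above is typically driven with `variance_type="learned_range"`: the model returns twice as many channels as the sample, and `step()` splits that output into the noise prediction and the predicted variance internally. The `unet` below is a stand-in rather than the karlo model, and the top-level import is assumed:

```python
import paddle

from ppdiffusers import UnCLIPScheduler  # assumed top-level export

scheduler = UnCLIPScheduler(variance_type="learned_range", prediction_type="epsilon")
scheduler.set_timesteps(25)

sample = paddle.randn([1, 3, 64, 64])


def unet(x, t):
    # stand-in for the decoder UNet: returns 2 * C channels
    # (noise prediction concatenated with predicted-variance logits)
    return paddle.concat([paddle.zeros_like(x), paddle.zeros_like(x)], axis=1)


timesteps = scheduler.timesteps
for i, t in enumerate(timesteps):
    model_output = unet(sample, t)
    # passing the next schedule entry lets step() recompute beta for the
    # skipped timesteps; on the final step it falls back to t - 1
    prev_t = timesteps[i + 1] if i + 1 < len(timesteps) else None
    sample = scheduler.step(model_output, t, sample, prev_timestep=prev_t).prev_sample
```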
-
-# DISCLAIMER: check https://arxiv.org/abs/2302.04867 and https://github.com/wl-zhao/UniPC for more info
-# The codebase is adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
-
-import math
-from typing import List, Optional, Tuple, Union
-
-import numpy as np
-import paddle
-
-from ..configuration_utils import ConfigMixin, register_to_config
-from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
-
-
-def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
-    """
-    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
-    (1-beta) over time from t = [0,1].
-
-    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
-    to that part of the diffusion process.
-
-
-    Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-            prevent singularities.
-
-    Returns:
-        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
-    """
-
-    def alpha_bar(time_step):
-        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
-
-    betas = []
-    for i in range(num_diffusion_timesteps):
-        t1 = i / num_diffusion_timesteps
-        t2 = (i + 1) / num_diffusion_timesteps
-        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
-    return paddle.to_tensor(betas, dtype=paddle.float32)
-
-
-class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
-    """
-    UniPC is a training-free framework designed for the fast sampling of diffusion models, which consists of a
-    corrector (UniC) and a predictor (UniP) that share a unified analytical form and support arbitrary orders. UniPC is
-    model-agnostic by design, supporting pixel-space/latent-space DPMs on unconditional/conditional sampling. It can
-    also be applied to both noise prediction and data prediction models. The corrector UniC can also be applied
-    after any off-the-shelf solvers to increase the order of accuracy.
-
-    For more details, see the original paper: https://arxiv.org/abs/2302.04867
-
-    Currently, we support the multistep UniPC for both noise prediction models and data prediction models. We recommend
-    using `solver_order=2` for guided sampling, and `solver_order=3` for unconditional sampling.
-
-    We also support the "dynamic thresholding" method in Imagen (https://arxiv.org/abs/2205.11487). For pixel-space
-    diffusion models, you can set both `predict_x0=True` and `thresholding=True` to use the dynamic thresholding. Note
-    that the thresholding method is unsuitable for latent-space diffusion models (such as stable-diffusion).
-
-    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
-    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
-    [`~SchedulerMixin.from_pretrained`] functions.
-
-    Args:
-        num_train_timesteps (`int`): number of diffusion steps used to train the model.
-        beta_start (`float`): the starting `beta` value of inference.
-        beta_end (`float`): the final `beta` value.
-        beta_schedule (`str`):
-            the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
-            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
-        trained_betas (`np.ndarray`, optional):
-            option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
-        solver_order (`int`, default `2`):
-            the order of UniPC, also the p in UniPC-p; can be any positive integer. Note that the effective order of
-            accuracy is `solver_order + 1` due to the UniC. We recommend using `solver_order=2` for guided sampling,
-            and `solver_order=3` for unconditional sampling.
-        prediction_type (`str`, default `epsilon`, optional):
-            prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
-            process), `sample` (directly predicting the noisy sample) or `v_prediction` (see section 2.4
-            https://imagen.research.google/video/paper.pdf)
-        thresholding (`bool`, default `False`):
-            whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487).
-            For pixel-space diffusion models, you can set both `predict_x0=True` and `thresholding=True` to use the
-            dynamic thresholding. Note that the thresholding method is unsuitable for latent-space diffusion models
-            (such as stable-diffusion).
-        dynamic_thresholding_ratio (`float`, default `0.995`):
-            the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen
-            (https://arxiv.org/abs/2205.11487).
-        sample_max_value (`float`, default `1.0`):
-            the threshold value for dynamic thresholding. Valid only when `thresholding=True` and `predict_x0=True`.
-        predict_x0 (`bool`, default `True`):
-            whether to use the updating algorithm on the predicted x0. See https://arxiv.org/abs/2211.01095 for details.
-        solver_type (`str`, default `bh2`):
-            the solver type of UniPC. We recommend using `bh1` for unconditional sampling when steps < 10, and `bh2`
-            otherwise.
-        lower_order_final (`bool`, default `True`):
-            whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. We empirically
-            find this trick can stabilize the sampling of DPM-Solver for steps < 15, especially for steps <= 10.
-        disable_corrector (`list`, default `[]`):
-            the steps at which to disable the corrector. For large guidance scales, the misalignment between
-            `epsilon_theta(x_t, c)` and `epsilon_theta(x_t^c, c)` might influence convergence. This can be mitigated
-            by disabling the corrector at the first few steps (e.g., disable_corrector=[0]).
-        solver_p (`SchedulerMixin`, default `None`):
-            can be any other scheduler. If specified, the algorithm will become solver_p + UniC.
-    """
-
-    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
-    order = 1
-
-    @register_to_config
-    def __init__(
-        self,
-        num_train_timesteps: int = 1000,
-        beta_start: float = 0.0001,
-        beta_end: float = 0.02,
-        beta_schedule: str = "linear",
-        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
-        solver_order: int = 2,
-        prediction_type: str = "epsilon",
-        thresholding: bool = False,
-        dynamic_thresholding_ratio: float = 0.995,
-        sample_max_value: float = 1.0,
-        predict_x0: bool = True,
-        solver_type: str = "bh2",
-        lower_order_final: bool = True,
-        disable_corrector: List[int] = [],
-        solver_p: SchedulerMixin = None,
-    ):
-        if trained_betas is not None:
-            self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32)
-        elif beta_schedule == "linear":
-            self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32)
-        elif beta_schedule == "scaled_linear":
-            # this schedule is very specific to the latent diffusion model.
- self.betas = ( - paddle.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=paddle.float32) ** 2 - ) - elif beta_schedule == "squaredcos_cap_v2": - # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) - else: - raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - # Currently we only support VP-type noise schedule - self.alpha_t = paddle.sqrt(self.alphas_cumprod) - self.sigma_t = paddle.sqrt(1 - self.alphas_cumprod) - self.lambda_t = paddle.log(self.alpha_t) - paddle.log(self.sigma_t) - - # standard deviation of the initial noise distribution - self.init_noise_sigma = 1.0 - - if solver_type not in ["bh1", "bh2"]: - if solver_type in ["midpoint", "heun", "logrho"]: - self.register_to_config(solver_type="bh1") - else: - raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") - - self.predict_x0 = predict_x0 - # setable values - self.num_inference_steps = None - timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy() - self.timesteps = paddle.to_tensor(timesteps) - self.model_outputs = [None] * solver_order - self.timestep_list = [None] * solver_order - self.lower_order_nums = 0 - self.disable_corrector = disable_corrector - self.solver_p = solver_p - self.last_sample = None - - def set_timesteps(self, num_inference_steps: int): - """ - Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. - """ - timesteps = ( - np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1) - .round()[::-1][:-1] - .copy() - .astype(np.int64) - ) - - # when num_inference_steps == num_train_timesteps, we can end up with - # duplicates in timesteps. - _, unique_indices = np.unique(timesteps, return_index=True) - timesteps = timesteps[np.sort(unique_indices)] - - self.timesteps = paddle.to_tensor(timesteps) - - self.num_inference_steps = len(timesteps) - - self.model_outputs = [ - None, - ] * self.config.solver_order - self.lower_order_nums = 0 - self.last_sample = None - if self.solver_p: - self.solver_p.set_timesteps(self.num_inference_steps) - - def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: - """ - "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the - prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by - s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing - pixels from saturation at each step. We find that dynamic thresholding results in significantly better - photorealism as well as better image-text alignment, especially when using very large guidance weights." 
- https://arxiv.org/abs/2205.11487 - """ - dtype = sample.dtype - batch_size, channels, height, width = sample.shape - - if dtype not in (paddle.float32, paddle.float64): - sample = paddle.cast( - sample, "float32" - ) # upcast for quantile calculation, and clamp not implemented for cpu half - - # Flatten sample for doing quantile calculation along each image - sample = paddle.reshape(sample, [batch_size, channels * height * width]) - - abs_sample = sample.abs() # "a certain percentile absolute pixel value" - - s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1) - # paddle.clip donot support min > max - if self.config.sample_max_value < 1: - s = paddle.ones_like(s) * self.config.sample_max_value - else: - s = paddle.clip( - s, min=1, max=self.config.sample_max_value - ) # When clip to min=1, equivalent to standard clipping to [-1, 1] - s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0 - sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" - - sample = paddle.reshape(sample, [batch_size, channels, height, width]) - sample = paddle.cast(sample, dtype) - - return sample - - def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor: - r""" - Convert the model output to the corresponding type that the algorithm PC needs. - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - - Returns: - `paddle.Tensor`: the converted model output. - """ - if self.predict_x0: - if self.config.prediction_type == "epsilon": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] - x0_pred = (sample - sigma_t * model_output) / alpha_t - elif self.config.prediction_type == "sample": - x0_pred = model_output - elif self.config.prediction_type == "v_prediction": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] - x0_pred = alpha_t * sample - sigma_t * model_output - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the UniPCMultistepScheduler." - ) - - if self.config.thresholding: - x0_pred = self._threshold_sample(x0_pred) - - return x0_pred - else: - if self.config.prediction_type == "epsilon": - return model_output - elif self.config.prediction_type == "sample": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] - epsilon = (sample - alpha_t * model_output) / sigma_t - return epsilon - elif self.config.prediction_type == "v_prediction": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] - epsilon = alpha_t * model_output + sigma_t * sample - return epsilon - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the UniPCMultistepScheduler." - ) - - def multistep_uni_p_bh_update( - self, - model_output: paddle.Tensor, - prev_timestep: int, - sample: paddle.Tensor, - order: int, - ) -> paddle.Tensor: - """ - One step for the UniP (B(h) version). Alternatively, `self.solver_p` is used if is specified. - - Args: - model_output (`paddle.Tensor`): - direct outputs from learned diffusion model at the current timestep. - prev_timestep (`int`): previous discrete timestep in the diffusion chain. 
- sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - order (`int`): the order of UniP at this step, also the p in UniPC-p. - - Returns: - `paddle.Tensor`: the sample tensor at the previous timestep. - """ - timestep_list = self.timestep_list - model_output_list = self.model_outputs - - s0, t = self.timestep_list[-1], prev_timestep - m0 = model_output_list[-1] - x = sample - - if self.solver_p: - x_t = self.solver_p.step(model_output, s0, x).prev_sample - return x_t - - lambda_t, lambda_s0 = self.lambda_t[t], self.lambda_t[s0] - alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0] - sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0] - - h = lambda_t - lambda_s0 - - rks = [] - D1s = [] - for i in range(1, order): - si = timestep_list[-(i + 1)] - mi = model_output_list[-(i + 1)] - lambda_si = self.lambda_t[si] - rk = (lambda_si - lambda_s0) / h - rks.append(rk.item()) - D1s.append((mi - m0) / rk) - - rks.append(1.0) - rks = paddle.to_tensor(rks) - - R = [] - b = [] - - hh = -h if self.predict_x0 else h - h_phi_1 = paddle.expm1(hh) # h\phi_1(h) = e^h - 1 - h_phi_k = h_phi_1 / hh - 1 - - factorial_i = 1 - - if self.config.solver_type == "bh1": - B_h = hh - elif self.config.solver_type == "bh2": - B_h = paddle.expm1(hh) - else: - raise NotImplementedError() - - for i in range(1, order + 1): - R.append(paddle.pow(rks, i - 1)) - b.append(h_phi_k * factorial_i / B_h) - factorial_i *= i + 1 - h_phi_k = h_phi_k / hh - 1 / factorial_i - - R = paddle.stack(R) - b = paddle.to_tensor(b) - - if len(D1s) > 0: - D1s = paddle.stack(D1s, axis=1).cast(paddle.float32) # (B, K) - # for order 2, we use a simplified version - if order == 2: - rhos_p = paddle.to_tensor([0.5]) - else: - rhos_p = paddle.linalg.solve(R[:-1, :-1], b[:-1]) - else: - D1s = None - - if self.predict_x0: - x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0 - if D1s is not None: - if rhos_p.shape[0] == 2: - rhos_p = rhos_p.squeeze(1) - pred_res = paddle.einsum("k,bkchw->bchw", rhos_p, D1s) - else: - pred_res = 0 - x_t = x_t_ - alpha_t * B_h * pred_res - else: - x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0 - if D1s is not None: - if rhos_p.shape[0] == 2: - rhos_p = rhos_p.squeeze(1) - pred_res = paddle.einsum("k,bkchw->bchw", rhos_p, D1s) - else: - pred_res = 0 - x_t = x_t_ - sigma_t * B_h * pred_res - - x_t = x_t.cast(x.dtype) - return x_t - - def multistep_uni_c_bh_update( - self, - this_model_output: paddle.Tensor, - this_timestep: int, - last_sample: paddle.Tensor, - this_sample: paddle.Tensor, - order: int, - ) -> paddle.Tensor: - """ - One step for the UniC (B(h) version). - - Args: - this_model_output (`paddle.Tensor`): the model outputs at `x_t` - this_timestep (`int`): the current timestep `t` - last_sample (`paddle.Tensor`): the generated sample before the last predictor: `x_{t-1}` - this_sample (`paddle.Tensor`): the generated sample after the last predictor: `x_{t}` - order (`int`): the `p` of UniC-p at this step. Note that the effective order of accuracy - should be order + 1 - - Returns: - `paddle.Tensor`: the corrected sample tensor at the current timestep. 
- """ - timestep_list = self.timestep_list - model_output_list = self.model_outputs - - s0, t = timestep_list[-1], this_timestep - m0 = model_output_list[-1] - x = last_sample - x_t = this_sample - model_t = this_model_output - - lambda_t, lambda_s0 = self.lambda_t[t], self.lambda_t[s0] - alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0] - sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0] - - h = lambda_t - lambda_s0 - - rks = [] - D1s = [] - for i in range(1, order): - si = timestep_list[-(i + 1)] - mi = model_output_list[-(i + 1)] - lambda_si = self.lambda_t[si] - rk = (lambda_si - lambda_s0) / h - rks.append(rk.item()) - D1s.append((mi - m0) / rk) - - rks.append(1.0) - rks = paddle.to_tensor(rks) - - R = [] - b = [] - - hh = -h if self.predict_x0 else h - h_phi_1 = paddle.expm1(hh) # h\phi_1(h) = e^h - 1 - h_phi_k = h_phi_1 / hh - 1 - - factorial_i = 1 - - if self.config.solver_type == "bh1": - B_h = hh - elif self.config.solver_type == "bh2": - B_h = paddle.expm1(hh) - else: - raise NotImplementedError() - - for i in range(1, order + 1): - R.append(paddle.pow(rks, i - 1)) - b.append(h_phi_k * factorial_i / B_h) - factorial_i *= i + 1 - h_phi_k = h_phi_k / hh - 1 / factorial_i - - R = paddle.stack(R) - b = paddle.to_tensor(b) - - if len(D1s) > 0: - # cast this to float32 - D1s = paddle.stack(D1s, axis=1).cast("float32") - else: - D1s = None - - # for order 1, we use a simplified version - if order == 1: - rhos_c = paddle.to_tensor([0.5]) - else: - rhos_c = paddle.linalg.solve(R, b) - - if self.predict_x0: - x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0 - if D1s is not None: - corr_res = paddle.einsum("k,bkchw->bchw", rhos_c[:-1].squeeze(1), D1s) - else: - corr_res = 0 - D1_t = model_t - m0 - x_t = x_t_ - alpha_t * B_h * (corr_res + rhos_c[-1] * D1_t) - else: - x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0 - if D1s is not None: - corr_res = paddle.einsum("k,bkchw->bchw", rhos_c[:-1].squeeze(1), D1s) - else: - corr_res = 0 - D1_t = model_t - m0 - x_t = x_t_ - sigma_t * B_h * (corr_res + rhos_c[-1] * D1_t) - x_t = x_t.cast(x.dtype) - return x_t - - def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool = True, - ) -> Union[SchedulerOutput, Tuple]: - """ - Step function propagating the sample with the multistep UniPC. - - Args: - model_output (`paddle.Tensor`): direct output from learned diffusion model. - timestep (`int`): current discrete timestep in the diffusion chain. - sample (`paddle.Tensor`): - current instance of sample being created by diffusion process. - return_dict (`bool`): option for returning tuple rather than SchedulerOutput class - - Returns: - [`~scheduling_utils.SchedulerOutput`] or `tuple`: [`~scheduling_utils.SchedulerOutput`] if `return_dict` is - True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. 
- - """ - - if self.num_inference_steps is None: - raise ValueError( - "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" - ) - - step_index = (self.timesteps == timestep).nonzero() - if len(step_index) == 0: - step_index = len(self.timesteps) - 1 - else: - step_index = step_index.item() - - use_corrector = ( - step_index > 0 and step_index - 1 not in self.disable_corrector and self.last_sample is not None - ) - - model_output_convert = self.convert_model_output(model_output, timestep, sample) - if use_corrector: - sample = self.multistep_uni_c_bh_update( - this_model_output=model_output_convert, - this_timestep=timestep, - last_sample=self.last_sample, - this_sample=sample, - order=self.this_order, - ) - - # now prepare to run the predictor - prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1] - - for i in range(self.config.solver_order - 1): - self.model_outputs[i] = self.model_outputs[i + 1] - self.timestep_list[i] = self.timestep_list[i + 1] - - self.model_outputs[-1] = model_output_convert - self.timestep_list[-1] = timestep - - if self.config.lower_order_final: - this_order = min(self.config.solver_order, len(self.timesteps) - step_index) - else: - this_order = self.config.solver_order - - self.this_order = min(this_order, self.lower_order_nums + 1) # warmup for multistep - assert self.this_order > 0 - - self.last_sample = sample - prev_sample = self.multistep_uni_p_bh_update( - model_output=model_output, # pass the original non-converted model output, in case solver-p is used - prev_timestep=prev_timestep, - sample=sample, - order=self.this_order, - ) - - if self.lower_order_nums < self.config.solver_order: - self.lower_order_nums += 1 - - if not return_dict: - return (prev_sample,) - - return SchedulerOutput(prev_sample=prev_sample) - - def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. 
- - Args: - sample (`paddle.Tensor`): input sample - - Returns: - `paddle.Tensor`: scaled input sample - """ - return sample - - # Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise - def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, - ) -> paddle.Tensor: - # Make sure alphas_cumprod and timestep have same dtype as original_samples - alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype) - - sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 - sqrt_alpha_prod = sqrt_alpha_prod.flatten() - while len(sqrt_alpha_prod.shape) < len(original_samples.shape): - sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - - noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise - return noisy_samples - - def __len__(self): - return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_utils.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_utils.py deleted file mode 100644 index 67cfeeb75734..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_utils.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import importlib -import os -from dataclasses import dataclass -from enum import Enum -from typing import Any, Dict, Optional, Union - -import paddle - -from ..utils import BaseOutput - -SCHEDULER_CONFIG_NAME = "scheduler_config.json" - -# NOTE: We make this type an enum because it simplifies usage in docs and prevents -# circular imports when used for `_compatibles` within the schedulers module. -# When it's used as a type in pipelines, it really is a Union because the actual -# scheduler instance is passed in. - - -class KarrasDiffusionSchedulers(Enum): - DDIMScheduler = 1 - DDPMScheduler = 2 - PNDMScheduler = 3 - LMSDiscreteScheduler = 4 - EulerDiscreteScheduler = 5 - HeunDiscreteScheduler = 6 - EulerAncestralDiscreteScheduler = 7 - DPMSolverMultistepScheduler = 8 - DPMSolverSinglestepScheduler = 9 - KDPM2DiscreteScheduler = 10 - KDPM2AncestralDiscreteScheduler = 11 - DEISMultistepScheduler = 12 - UniPCMultistepScheduler = 13 - - -@dataclass -class SchedulerOutput(BaseOutput): - """ - Base class for the scheduler's step function output. - - Args: - prev_sample (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - """ - - prev_sample: paddle.Tensor - - -class SchedulerMixin: - """ - Mixin containing common functions for the schedulers. 
-
-    Class attributes:
-        - **_compatibles** (`List[str]`) -- A list of classes that are compatible with the parent class, so that
-          `from_config` can be used from a class different than the one used to save the config (should be overridden
-          by parent class).
-    """
-
-    config_name = SCHEDULER_CONFIG_NAME
-    _compatibles = []
-    has_compatibles = True
-
-    @classmethod
-    def from_pretrained(
-        cls,
-        pretrained_model_name_or_path: Dict[str, Any] = None,
-        subfolder: Optional[str] = None,
-        return_unused_kwargs: bool = False,
-        **kwargs,
-    ):
-        r"""
-        Instantiate a Scheduler class from a pre-defined JSON configuration file inside a directory or Hub repo.
-
-        Parameters:
-            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
-                Can be either:
-
-                    - A string, the *model id* of a model repo on huggingface.co. Valid model ids should have an
-                      organization name, like `google/ddpm-celebahq-256`.
-                    - A path to a *directory* containing the scheduler configurations saved using
-                      [`~SchedulerMixin.save_pretrained`], e.g., `./my_model_directory/`.
-            subfolder (`str`, *optional*):
-                In case the relevant files are located inside a subfolder of the model repo (either remote in
-                huggingface.co or downloaded locally), you can specify the folder name here.
-            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
-                Whether kwargs that are not consumed by the Python class should be returned or not.
-            cache_dir (`Union[str, os.PathLike]`, *optional*):
-                Path to a directory in which a downloaded pretrained model configuration should be cached if the
-                standard cache should not be used.
-            force_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
-                cached versions if they exist.
-            resume_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
-                file exists.
-            proxies (`Dict[str, str]`, *optional*):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            output_loading_info(`bool`, *optional*, defaults to `False`):
-                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
-            local_files_only(`bool`, *optional*, defaults to `False`):
-                Whether or not to only look at local files (i.e., do not try to download the model).
-            use_auth_token (`str` or *bool*, *optional*):
-                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
-                when running `transformers-cli login` (stored in `~/.huggingface`).
-            revision (`str`, *optional*, defaults to `"main"`):
-                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
-                identifier allowed by git.
-            from_hf_hub (bool, *optional*):
-                Whether to load from the Hugging Face Hub. Defaults to `False`.
-
-
-        It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
-        models](https://huggingface.co/docs/hub/models-gated#gated-models).
-
-
-
-
-        Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
-        use this method in a firewalled environment.
- - - - """ - config, kwargs, commit_hash = cls.load_config( - pretrained_model_name_or_path=pretrained_model_name_or_path, - subfolder=subfolder, - return_unused_kwargs=True, - return_commit_hash=True, - **kwargs, - ) - return cls.from_config(config, return_unused_kwargs=return_unused_kwargs, **kwargs) - - def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): - """ - Save a scheduler configuration object to the directory `save_directory`, so that it can be re-loaded using the - [`~SchedulerMixin.from_pretrained`] class method. - - Args: - save_directory (`str` or `os.PathLike`): - Directory where the configuration JSON file will be saved (will be created if it does not exist). - """ - self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs) - - @property - def compatibles(self): - """ - Returns all schedulers that are compatible with this scheduler - - Returns: - `List[SchedulerMixin]`: List of compatible schedulers - """ - return self._get_compatibles() - - @classmethod - def _get_compatibles(cls): - compatible_classes_str = list(set([cls.__name__] + cls._compatibles)) - diffusers_library = importlib.import_module(__name__.split(".")[0]) - compatible_classes = [ - getattr(diffusers_library, c) for c in compatible_classes_str if hasattr(diffusers_library, c) - ] - return compatible_classes diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_vq_diffusion.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_vq_diffusion.py deleted file mode 100644 index b5b3cebcd734..000000000000 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_vq_diffusion.py +++ /dev/null @@ -1,497 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 Microsoft and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import numpy as np -import paddle -import paddle.nn.functional as F - -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, rand_tensor -from .scheduling_utils import SchedulerMixin - - -def logaddexp(a, b): - return paddle.log(a.exp() + b.exp()) - - -# (TODO junnyu) paddle logsumexp may has bug -def logsumexp(x, axis=None, keepdim=False): - return paddle.log(x.exp().sum(axis=axis, keepdim=keepdim)) - - -@dataclass -class VQDiffusionSchedulerOutput(BaseOutput): - """ - Output class for the scheduler's step function output. - - Args: - prev_sample (`paddle.Tensor` of shape `(batch size, num latent pixels)`): - Computed sample x_{t-1} of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. 
- """ - - prev_sample: paddle.Tensor - - -def index_to_log_onehot(x: paddle.Tensor, num_classes: int) -> paddle.Tensor: - """ - Convert batch of vector of class indices into batch of log onehot vectors - - Args: - x (`paddle.Tensor` of shape `(batch size, vector length)`): - Batch of class indices - - num_classes (`int`): - number of classes to be used for the onehot vectors - - Returns: - `paddle.Tensor` of shape `(batch size, num classes, vector length)`: - Log onehot vectors - """ - x_onehot = F.one_hot(x, num_classes) - x_onehot = x_onehot.transpose([0, 2, 1]) - log_x = paddle.log(x_onehot.cast("float32").clip(min=1e-30)) - return log_x - - -def gumbel_noised(logits: paddle.Tensor, generator: Optional[paddle.Generator]) -> paddle.Tensor: - """ - Apply gumbel noise to `logits` - """ - uniform = rand_tensor(logits.shape, generator=generator) - gumbel_noise = -paddle.log(-paddle.log(uniform + 1e-30) + 1e-30) - noised = gumbel_noise + logits - return noised - - -def alpha_schedules(num_diffusion_timesteps: int, alpha_cum_start=0.99999, alpha_cum_end=0.000009): - """ - Cumulative and non-cumulative alpha schedules. - - See section 4.1. - """ - att = ( - np.arange(0, num_diffusion_timesteps) / (num_diffusion_timesteps - 1) * (alpha_cum_end - alpha_cum_start) - + alpha_cum_start - ) - att = np.concatenate(([1], att)) - at = att[1:] / att[:-1] - att = np.concatenate((att[1:], [1])) - return at, att - - -def gamma_schedules(num_diffusion_timesteps: int, gamma_cum_start=0.000009, gamma_cum_end=0.99999): - """ - Cumulative and non-cumulative gamma schedules. - - See section 4.1. - """ - ctt = ( - np.arange(0, num_diffusion_timesteps) / (num_diffusion_timesteps - 1) * (gamma_cum_end - gamma_cum_start) - + gamma_cum_start - ) - ctt = np.concatenate(([0], ctt)) - one_minus_ctt = 1 - ctt - one_minus_ct = one_minus_ctt[1:] / one_minus_ctt[:-1] - ct = 1 - one_minus_ct - ctt = np.concatenate((ctt[1:], [0])) - return ct, ctt - - -class VQDiffusionScheduler(SchedulerMixin, ConfigMixin): - """ - The VQ-diffusion transformer outputs predicted probabilities of the initial unnoised image. - - The VQ-diffusion scheduler converts the transformer's output into a sample for the unnoised image at the previous - diffusion timestep. - - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and - [`~SchedulerMixin.from_pretrained`] functions. - - For more details, see the original paper: https://arxiv.org/abs/2111.14822 - - Args: - num_vec_classes (`int`): - The number of classes of the vector embeddings of the latent pixels. Includes the class for the masked - latent pixel. - - num_train_timesteps (`int`): - Number of diffusion steps used to train the model. - - alpha_cum_start (`float`): - The starting cumulative alpha value. - - alpha_cum_end (`float`): - The ending cumulative alpha value. - - gamma_cum_start (`float`): - The starting cumulative gamma value. - - gamma_cum_end (`float`): - The ending cumulative gamma value. 
- """ - - order = 1 - - @register_to_config - def __init__( - self, - num_vec_classes: int, - num_train_timesteps: int = 100, - alpha_cum_start: float = 0.99999, - alpha_cum_end: float = 0.000009, - gamma_cum_start: float = 0.000009, - gamma_cum_end: float = 0.99999, - ): - self.num_embed = num_vec_classes - - # By convention, the index for the mask class is the last class index - self.mask_class = self.num_embed - 1 - - at, att = alpha_schedules(num_train_timesteps, alpha_cum_start=alpha_cum_start, alpha_cum_end=alpha_cum_end) - ct, ctt = gamma_schedules(num_train_timesteps, gamma_cum_start=gamma_cum_start, gamma_cum_end=gamma_cum_end) - - num_non_mask_classes = self.num_embed - 1 - bt = (1 - at - ct) / num_non_mask_classes - btt = (1 - att - ctt) / num_non_mask_classes - - at = paddle.to_tensor(at.astype("float64")) - bt = paddle.to_tensor(bt.astype("float64")) - ct = paddle.to_tensor(ct.astype("float64")) - log_at = paddle.log(at) - log_bt = paddle.log(bt) - log_ct = paddle.log(ct) - - att = paddle.to_tensor(att.astype("float64")) - btt = paddle.to_tensor(btt.astype("float64")) - ctt = paddle.to_tensor(ctt.astype("float64")) - log_cumprod_at = paddle.log(att) - log_cumprod_bt = paddle.log(btt) - log_cumprod_ct = paddle.log(ctt) - - self.log_at = log_at.cast("float32") - self.log_bt = log_bt.cast("float32") - self.log_ct = log_ct.cast("float32") - self.log_cumprod_at = log_cumprod_at.cast("float32") - self.log_cumprod_bt = log_cumprod_bt.cast("float32") - self.log_cumprod_ct = log_cumprod_ct.cast("float32") - - # setable values - self.num_inference_steps = None - self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy()) - - def set_timesteps(self, num_inference_steps: int): - """ - Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. - - Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. - """ - self.num_inference_steps = num_inference_steps - timesteps = np.arange(0, self.num_inference_steps)[::-1].copy() - self.timesteps = paddle.to_tensor(timesteps) - - def step( - self, - model_output: paddle.Tensor, - timestep: paddle.Tensor, - sample: paddle.Tensor, - generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, - return_dict: bool = True, - ) -> Union[VQDiffusionSchedulerOutput, Tuple]: - """ - Predict the sample at the previous timestep via the reverse transition distribution i.e. Equation (11). See the - docstring for `self.q_posterior` for more in depth docs on how Equation (11) is computed. - - Args: - log_p_x_0: (`paddle.Tensor` of shape `(batch size, num classes - 1, num latent pixels)`): - The log probabilities for the predicted classes of the initial latent pixels. Does not include a - prediction for the masked class as the initial unnoised image cannot be masked. - - t (`paddle.Tensor`): - The timestep that determines which transition matrices are used. - - x_t: (`paddle.Tensor` of shape `(batch size, num latent pixels)`): - The classes of each latent pixel at time `t` - - generator: (`paddle.Generator` or None): - RNG for the noise applied to p(x_{t-1} | x_t) before it is sampled from. - - return_dict (`bool`): - option for returning tuple rather than VQDiffusionSchedulerOutput class - - Returns: - [`~schedulers.scheduling_utils.VQDiffusionSchedulerOutput`] or `tuple`: - [`~schedulers.scheduling_utils.VQDiffusionSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. 
- When returning a tuple, the first element is the sample tensor. - """ - if timestep == 0: - log_p_x_t_min_1 = model_output - else: - log_p_x_t_min_1 = self.q_posterior(model_output, sample, timestep) - - log_p_x_t_min_1 = gumbel_noised(log_p_x_t_min_1, generator) - - x_t_min_1 = log_p_x_t_min_1.argmax(axis=1) - - if not return_dict: - return (x_t_min_1,) - - return VQDiffusionSchedulerOutput(prev_sample=x_t_min_1) - - def q_posterior(self, log_p_x_0, x_t, t): - """ - Calculates the log probabilities for the predicted classes of the image at timestep `t-1`. I.e. Equation (11). - - Instead of directly computing equation (11), we use Equation (5) to restate Equation (11) in terms of only - forward probabilities. - - Equation (11) stated in terms of forward probabilities via Equation (5): - - Where: - - the sum is over x_0 = {C_0 ... C_{k-1}} (classes for x_0) - - p(x_{t-1} | x_t) = sum( q(x_t | x_{t-1}) * q(x_{t-1} | x_0) * p(x_0) / q(x_t | x_0) ) - - Args: - log_p_x_0: (`paddle.Tensor` of shape `(batch size, num classes - 1, num latent pixels)`): - The log probabilities for the predicted classes of the initial latent pixels. Does not include a - prediction for the masked class as the initial unnoised image cannot be masked. - - x_t: (`paddle.Tensor` of shape `(batch size, num latent pixels)`): - The classes of each latent pixel at time `t` - - t (paddle.Tensor): - The timestep that determines which transition matrix is used. - - Returns: - `paddle.Tensor` of shape `(batch size, num classes, num latent pixels)`: - The log probabilities for the predicted classes of the image at timestep `t-1`. I.e. Equation (11). - """ - log_onehot_x_t = index_to_log_onehot(x_t, self.num_embed) - - log_q_x_t_given_x_0 = self.log_Q_t_transitioning_to_known_class( - t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=True - ) - - log_q_t_given_x_t_min_1 = self.log_Q_t_transitioning_to_known_class( - t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=False - ) - - # p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) ... p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) - # . . . - # . . . - # . . . - # p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) ... p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) - q = log_p_x_0 - log_q_x_t_given_x_0 - - # sum_0 = p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) + ... + p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}), ... , - # sum_n = p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) + ... + p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) - q_log_sum_exp = logsumexp(q, axis=1, keepdim=True) - - # p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_0 ... p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_n - # . . . - # . . . - # . . . - # p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_0 ... p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_n - q = q - q_log_sum_exp - - # (p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1} ... (p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1} - # . . . - # . . . - # . . . - # (p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1} ... (p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1} - # c_cumulative_{t-1} ... c_cumulative_{t-1} - q = self.apply_cumulative_transitions(q, t - 1) - - # ((p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_0) * sum_0 ... 
((p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_0) * sum_n - # . . . - # . . . - # . . . - # ((p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_{k-1}) * sum_0 ... ((p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_{k-1}) * sum_n - # c_cumulative_{t-1} * q(x_t | x_{t-1}=C_k) * sum_0 ... c_cumulative_{t-1} * q(x_t | x_{t-1}=C_k) * sum_0 - log_p_x_t_min_1 = q + log_q_t_given_x_t_min_1 + q_log_sum_exp - - # For each column, there are two possible cases. - # - # Where: - # - sum(p_n(x_0))) is summing over all classes for x_0 - # - C_i is the class transitioning from (not to be confused with c_t and c_cumulative_t being used for gamma's) - # - C_j is the class transitioning to - # - # 1. x_t is masked i.e. x_t = c_k - # - # Simplifying the expression, the column vector is: - # . - # . - # . - # (c_t / c_cumulative_t) * (a_cumulative_{t-1} * p_n(x_0 = C_i | x_t) + b_cumulative_{t-1} * sum(p_n(x_0))) - # . - # . - # . - # (c_cumulative_{t-1} / c_cumulative_t) * sum(p_n(x_0)) - # - # From equation (11) stated in terms of forward probabilities, the last row is trivially verified. - # - # For the other rows, we can state the equation as ... - # - # (c_t / c_cumulative_t) * [b_cumulative_{t-1} * p(x_0=c_0) + ... + (a_cumulative_{t-1} + b_cumulative_{t-1}) * p(x_0=C_i) + ... + b_cumulative_{k-1} * p(x_0=c_{k-1})] - # - # This verifies the other rows. - # - # 2. x_t is not masked - # - # Simplifying the expression, there are two cases for the rows of the column vector, where C_j = C_i and where C_j != C_i: - # . - # . - # . - # C_j != C_i: b_t * ((b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_0) + ... + ((a_cumulative_{t-1} + b_cumulative_{t-1}) / b_cumulative_t) * p_n(x_0 = C_i) + ... + (b_cumulative_{t-1} / (a_cumulative_t + b_cumulative_t)) * p_n(c_0=C_j) + ... + (b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_{k-1})) - # . - # . - # . - # C_j = C_i: (a_t + b_t) * ((b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_0) + ... + ((a_cumulative_{t-1} + b_cumulative_{t-1}) / (a_cumulative_t + b_cumulative_t)) * p_n(x_0 = C_i = C_j) + ... + (b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_{k-1})) - # . - # . - # . - # 0 - # - # The last row is trivially verified. The other rows can be verified by directly expanding equation (11) stated in terms of forward probabilities. - return log_p_x_t_min_1 - - def log_Q_t_transitioning_to_known_class( - self, *, t: paddle.Tensor, x_t: paddle.Tensor, log_onehot_x_t: paddle.Tensor, cumulative: bool - ): - """ - Returns the log probabilities of the rows from the (cumulative or non-cumulative) transition matrix for each - latent pixel in `x_t`. - - See equation (7) for the complete non-cumulative transition matrix. The complete cumulative transition matrix - is the same structure except the parameters (alpha, beta, gamma) are the cumulative analogs. - - Args: - t (paddle.Tensor): - The timestep that determines which transition matrix is used. - - x_t (`paddle.Tensor` of shape `(batch size, num latent pixels)`): - The classes of each latent pixel at time `t`. - - log_onehot_x_t (`paddle.Tensor` of shape `(batch size, num classes, num latent pixels)`): - The log one-hot vectors of `x_t` - - cumulative (`bool`): - If cumulative is `False`, we use the single step transition matrix `t-1`->`t`. 
If cumulative is `True`,
-                we use the cumulative transition matrix `0`->`t`.
-
-        Returns:
-            `paddle.Tensor` of shape `(batch size, num classes - 1, num latent pixels)`:
-                Each _column_ of the returned matrix is a _row_ of log probabilities of the complete probability
-                transition matrix.
-
-                When non-cumulative, returns `self.num_classes - 1` rows because the initial latent pixel cannot be
-                masked.
-
-                Where:
-                - `q_n` is the probability distribution for the forward process of the `n`th latent pixel.
-                - C_0 is a class of a latent pixel embedding
-                - C_k is the class of the masked latent pixel
-
-                non-cumulative result (omitting logarithms):
-                ```
-                q_0(x_t | x_{t-1} = C_0) ... q_n(x_t | x_{t-1} = C_0)
-                          .      .                     .
-                          .      .                     .
-                          .      .                     .
-                q_0(x_t | x_{t-1} = C_k) ... q_n(x_t | x_{t-1} = C_k)
-                ```
-
-                cumulative result (omitting logarithms):
-                ```
-                q_0_cumulative(x_t | x_0 = C_0) ... q_n_cumulative(x_t | x_0 = C_0)
-                          .      .                     .
-                          .      .                     .
-                          .      .                     .
-                q_0_cumulative(x_t | x_0 = C_{k-1}) ... q_n_cumulative(x_t | x_0 = C_{k-1})
-                ```
-        """
-        if cumulative:
-            a = self.log_cumprod_at[t]
-            b = self.log_cumprod_bt[t]
-            c = self.log_cumprod_ct[t]
-        else:
-            a = self.log_at[t]
-            b = self.log_bt[t]
-            c = self.log_ct[t]
-
-        if not cumulative:
-            # The values in the onehot vector can also be used as the logprobs for transitioning
-            # from masked latent pixels. If we are not calculating the cumulative transitions,
-            # we need to save these vectors to be re-appended to the final matrix so the values
-            # aren't overwritten.
-            #
-            # `P(x_t != mask | x_{t-1} = mask) = 0` and 0 will be the value of the last row of the onehot vector
-            # if x_t is not masked
-            #
-            # `P(x_t = mask | x_{t-1} = mask) = 1` and 1 will be the value of the last row of the onehot vector
-            # if x_t is masked
-            log_onehot_x_t_transitioning_from_masked = log_onehot_x_t[:, -1, :].unsqueeze(1)
-
-            # `index_to_log_onehot` will add onehot vectors for masked pixels,
-            # so the default one hot matrix has one too many rows. See the docstring
-            # for an explanation of the dimensionality of the returned matrix.
-            log_onehot_x_t = log_onehot_x_t[:, :-1, :]
-
-        # this is a cheeky trick to produce the transition probabilities using log one-hot vectors.
-        #
-        # Don't worry about what values this sets in the columns that mark transitions
-        # to masked latent pixels. They are overwritten later with the `mask_class_mask`.
-        #
-        # Looking at the below logspace formula in non-logspace, each value will evaluate to either
-        # `1 * a + b = a + b` where `log_Q_t` has the one hot value in the column
-        # or
-        # `0 * a + b = b` where `log_Q_t` has the 0 values in the column.
-        #
-        # See equation 7 for more details.
- log_Q_t = logaddexp(log_onehot_x_t + a, b) - - # The whole column of each masked pixel is `c` - mask_class_mask = x_t == self.mask_class - mask_class_mask = mask_class_mask.unsqueeze(1).expand([-1, self.num_embed - 1, -1]) - # log_Q_t[mask_class_mask] = c - log_Q_t = paddle.where(mask_class_mask, c, log_Q_t) - - if not cumulative: - log_Q_t = paddle.concat((log_Q_t, log_onehot_x_t_transitioning_from_masked), axis=1) - - return log_Q_t - - def apply_cumulative_transitions(self, q, t): - bsz = q.shape[0] - a = self.log_cumprod_at[t] - b = self.log_cumprod_bt[t] - c = self.log_cumprod_ct[t] - - num_latent_pixels = q.shape[2] - c = c.expand([bsz, 1, num_latent_pixels]) - - q = logaddexp(q + a, b) - q = paddle.concat((q, c), axis=1) - - return q diff --git a/ppdiffusers/ppdiffusers/training_utils.py b/ppdiffusers/ppdiffusers/training_utils.py deleted file mode 100644 index d5164e7af9e5..000000000000 --- a/ppdiffusers/ppdiffusers/training_utils.py +++ /dev/null @@ -1,382 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import contextlib -import copy -import os -import random -from typing import Any, Dict, Optional, Union - -import numpy as np -import paddle -import paddle.distributed -import paddle.nn as nn - -from .utils import deprecate, get_logger - -logger = get_logger(__name__) - - -def enable_full_determinism(seed: int): - """ - Helper function for reproducible behavior during distributed training. - """ - # set seed first - set_seed(seed) - - # Enable Paddle deterministic mode. This potentially requires either the environment - # variable 'CUDA_LAUNCH_BLOCKING' or 'CUBLAS_WORKSPACE_CONFIG' to be set, - # depending on the CUDA version, so we set them both here - os.environ["CUDA_LAUNCH_BLOCKING"] = "1" - os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8" - os.environ["FLAGS_cudnn_deterministic"] = "True" - os.environ["FLAGS_benchmark"] = "True" - os.environ["FLAGS_conv_workspace_size_limit"] = "4096" - - -def set_seed(seed: int): - """ - Args: - Helper function for reproducible behavior to set the seed in `random`, `numpy`, `paddle`. - seed (`int`): The seed to set. - """ - if seed is not None: - random.seed(seed) - np.random.seed(seed) - paddle.seed(seed) - # ^^ safe to call this function even if cuda is not available - - -# Adapted from torch-ema https://github.com/fadel/pytorch_ema/blob/master/torch_ema/ema.py#L14 -class EMAModel: - """ - Exponential Moving Average of models weights - """ - - def __init__( - self, - parameters, - decay: float = 0.9999, - min_decay: float = 0.0, - update_after_step: int = 0, - use_ema_warmup: bool = False, - inv_gamma: Union[float, int] = 1.0, - power: Union[float, int] = 2 / 3, - model_cls: Optional[Any] = None, - model_config: Dict[str, Any] = None, - **kwargs, - ): - """ - Args: - parameters (Iterable[nn.Parameter]): The parameters to track. - decay (float): The decay factor for the exponential moving average. 
- min_decay (float): The minimum decay factor for the exponential moving average. - update_after_step (int): The number of steps to wait before starting to update the EMA weights. - use_ema_warmup (bool): Whether to use EMA warmup. - inv_gamma (float): - Inverse multiplicative factor of EMA warmup. Default: 1. Only used if `use_ema_warmup` is True. - power (float): Exponential factor of EMA warmup. Default: 2/3. Only used if `use_ema_warmup` is True. - - @crowsonkb's notes on EMA Warmup: - If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan - to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps), - gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999 - at 215.4k steps). - """ - - if isinstance(parameters, nn.Layer): - deprecation_message = ( - "Passing a `nn.Layer` to `ExponentialMovingAverage` is deprecated. " - "Please pass the parameters of the module instead." - ) - deprecate( - "passing a `nn.Layer` to `ExponentialMovingAverage`", - "1.0.0", - deprecation_message, - standard_warn=False, - ) - parameters = parameters.parameters() - - # set use_ema_warmup to True if a nn.Layer is passed for backwards compatibility - use_ema_warmup = True - - if kwargs.get("max_value", None) is not None: - deprecation_message = "The `max_value` argument is deprecated. Please use `decay` instead." - deprecate("max_value", "1.0.0", deprecation_message, standard_warn=False) - decay = kwargs["max_value"] - - if kwargs.get("min_value", None) is not None: - deprecation_message = "The `min_value` argument is deprecated. Please use `min_decay` instead." - deprecate("min_value", "1.0.0", deprecation_message, standard_warn=False) - min_decay = kwargs["min_value"] - - parameters = list(parameters) - self.shadow_params = [p.clone().detach() for p in parameters] - - if kwargs.get("device", None) is not None: - deprecation_message = "The `device` argument is deprecated. Please use `to` instead." - deprecate("device", "1.0.0", deprecation_message, standard_warn=False) - self.to(device=kwargs["device"]) - - self.temp_stored_params = None - - self.decay = decay - self.min_decay = min_decay - self.update_after_step = update_after_step - self.use_ema_warmup = use_ema_warmup - self.inv_gamma = inv_gamma - self.power = power - self.optimization_step = 0 - self.cur_decay_value = None # set in `step()` - - self.model_cls = model_cls - self.model_config = model_config - - @classmethod - def from_pretrained(cls, path, model_cls) -> "EMAModel": - _, ema_kwargs = model_cls.load_config(path, return_unused_kwargs=True) - model = model_cls.from_pretrained(path) - - ema_model = cls(model.parameters(), model_cls=model_cls, model_config=model.config) - - ema_model.load_state_dict(ema_kwargs) - return ema_model - - def save_pretrained(self, path): - if self.model_cls is None: - raise ValueError("`save_pretrained` can only be used if `model_cls` was defined at __init__.") - - if self.model_config is None: - raise ValueError("`save_pretrained` can only be used if `model_config` was defined at __init__.") - - model = self.model_cls.from_config(self.model_config) - state_dict = self.state_dict() - state_dict.pop("shadow_params", None) - - model.register_to_config(**state_dict) - self.copy_to(model.parameters()) - model.save_pretrained(path) - - def get_decay(self, optimization_step: int) -> float: - """ - Compute the decay factor for the exponential moving average. - """ - # we donot -1! 
- step = max(0, optimization_step - self.update_after_step) - - if step <= 0: - return 0.0 - - if self.use_ema_warmup: - cur_decay_value = 1 - (1 + step / self.inv_gamma) ** -self.power - else: - cur_decay_value = (1 + step) / (10 + step) - - cur_decay_value = min(cur_decay_value, self.decay) - # make sure decay is not smaller than min_decay - cur_decay_value = max(cur_decay_value, self.min_decay) - return cur_decay_value - - @paddle.no_grad() - def step(self, parameters): - if isinstance(parameters, nn.Layer): - deprecation_message = ( - "Passing a `nn.Layer` to `ExponentialMovingAverage.step` is deprecated. " - "Please pass the parameters of the module instead." - ) - deprecate( - "passing a `nn.Layer` to `ExponentialMovingAverage.step`", - "1.0.0", - deprecation_message, - standard_warn=False, - ) - parameters = parameters.parameters() - - parameters = list(parameters) - - self.optimization_step += 1 - - # Compute the decay factor for the exponential moving average. - decay = self.get_decay(self.optimization_step) - self.cur_decay_value = decay - one_minus_decay = 1 - decay - - for s_param, param in zip(self.shadow_params, parameters): - if not param.stop_gradient: - s_param.copy_(s_param - one_minus_decay * (s_param - param), True) - else: - s_param.copy_(param, True) - - def copy_to(self, parameters) -> None: - """ - Copy current averaged parameters into given collection of parameters. - - Args: - parameters: Iterable of `nn.Parameter`; the parameters to be - updated with the stored moving averages. If `None`, the parameters with which this - `ExponentialMovingAverage` was initialized will be used. - """ - parameters = list(parameters) - for s_param, param in zip(self.shadow_params, parameters): - param.copy_(s_param, True) - - def state_dict(self) -> dict: - r""" - Returns the state of the ExponentialMovingAverage as a dict. This method is used by accelerate during - checkpointing to save the ema state dict. - """ - # Following PyTorch conventions, references to tensors are returned: - # "returns a reference to the state and not its copy!" - - # https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict - return { - "decay": self.decay, - "min_decay": self.min_decay, - "optimization_step": self.optimization_step, - "update_after_step": self.update_after_step, - "use_ema_warmup": self.use_ema_warmup, - "inv_gamma": self.inv_gamma, - "power": self.power, - "shadow_params": self.shadow_params, - } - - def store(self, parameters) -> None: - r""" - Args: - Save the current parameters for restoring later. - parameters: Iterable of `nn.Parameter`; the parameters to be - temporarily stored. - """ - self.temp_stored_params = [param.detach().cpu().clone() for param in parameters] - - def restore(self, parameters) -> None: - r""" - Args: - Restore the parameters stored with the `store` method. Useful to validate the model with EMA parameters without: - affecting the original optimization process. Store the parameters before the `copy_to()` method. After - validation (or model saving), use this to restore the former parameters. - parameters: Iterable of `nn.Parameter`; the parameters to be - updated with the stored parameters. If `None`, the parameters with which this - `ExponentialMovingAverage` was initialized will be used. 
- """ - if self.temp_stored_params is None: - raise RuntimeError("This ExponentialMovingAverage has no `store()`ed weights " "to `restore()`") - for c_param, param in zip(self.temp_stored_params, parameters): - param.copy_(c_param, True) - - # Better memory-wise. - self.temp_stored_params = None - - def load_state_dict(self, state_dict: dict) -> None: - r""" - Args: - Loads the ExponentialMovingAverage state. This method is used by accelerate during checkpointing to save the - ema state dict. - state_dict (dict): EMA state. Should be an object returned - from a call to :meth:`state_dict`. - """ - # deepcopy, to be consistent with module API - state_dict = copy.deepcopy(state_dict) - - self.decay = state_dict.get("decay", self.decay) - if self.decay < 0.0 or self.decay > 1.0: - raise ValueError("Decay must be between 0 and 1") - - self.min_decay = state_dict.get("min_decay", self.min_decay) - if not isinstance(self.min_decay, float): - raise ValueError("Invalid min_decay") - - self.optimization_step = state_dict.get("optimization_step", self.optimization_step) - if not isinstance(self.optimization_step, int): - raise ValueError("Invalid optimization_step") - - self.update_after_step = state_dict.get("update_after_step", self.update_after_step) - if not isinstance(self.update_after_step, int): - raise ValueError("Invalid update_after_step") - - self.use_ema_warmup = state_dict.get("use_ema_warmup", self.use_ema_warmup) - if not isinstance(self.use_ema_warmup, bool): - raise ValueError("Invalid use_ema_warmup") - - self.inv_gamma = state_dict.get("inv_gamma", self.inv_gamma) - if not isinstance(self.inv_gamma, (float, int)): - raise ValueError("Invalid inv_gamma") - - self.power = state_dict.get("power", self.power) - if not isinstance(self.power, (float, int)): - raise ValueError("Invalid power") - - shadow_params = state_dict.get("shadow_params", None) - if shadow_params is not None: - self.shadow_params = shadow_params - if not isinstance(self.shadow_params, list): - raise ValueError("shadow_params must be a list") - if not all(isinstance(p, paddle.Tensor) for p in self.shadow_params): - raise ValueError("shadow_params must all be Tensors") - - -@contextlib.contextmanager -def main_process_first(desc="work"): - if paddle.distributed.get_world_size() > 1: - rank = paddle.distributed.get_rank() - is_main_process = rank == 0 - main_process_desc = "main local process" - - try: - if not is_main_process: - # tell all replicas to wait - logger.debug(f"{rank}: waiting for the {main_process_desc} to perform {desc}") - paddle.distributed.barrier() - yield - finally: - if is_main_process: - # the wait is over - logger.debug(f"{rank}: {main_process_desc} completed {desc}, releasing all replicas") - paddle.distributed.barrier() - else: - yield - - -def unfreeze_params(params): - for param in params: - param.stop_gradient = False - - -def freeze_params(params): - for param in params: - param.stop_gradient = True - - -def unfreeze_model(model: nn.Layer): - for param in model.parameters(): - param.stop_gradient = False - - -def freeze_model(model: nn.Layer): - for param in model.parameters(): - param.stop_gradient = True - - -def unwrap_model(model: nn.Layer) -> nn.Layer: - """ - Recursively unwraps a model from potential containers (as used in distributed training). - - Args: - model (`nn.Layer`): The model to unwrap. 
- """ - # since there could be multiple levels of wrapping, unwrap recursively - if hasattr(model, "_layers"): - return unwrap_model(model._layers) - else: - return model diff --git a/ppdiffusers/ppdiffusers/utils/__init__.py b/ppdiffusers/ppdiffusers/utils/__init__.py deleted file mode 100644 index 3aff9f747e7e..000000000000 --- a/ppdiffusers/ppdiffusers/utils/__init__.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import os - -from packaging import version - -from ..version import VERSION as __version__ -from . import initializer_utils -from .constants import ( - CONFIG_NAME, - DEPRECATED_REVISION_ARGS, - DIFFUSERS_CACHE, - DOWNLOAD_SERVER, - FASTDEPLOY_MODEL_NAME, - FASTDEPLOY_WEIGHTS_NAME, - FLAX_WEIGHTS_NAME, - FROM_DIFFUSERS, - FROM_HF_HUB, - HF_MODULES_CACHE, - HUGGINGFACE_CO_RESOLVE_ENDPOINT, - LOW_CPU_MEM_USAGE_DEFAULT, - NEG_INF, - ONNX_EXTERNAL_WEIGHTS_NAME, - ONNX_WEIGHTS_NAME, - PADDLE_WEIGHTS_NAME, - PPDIFFUSERS_CACHE, - PPDIFFUSERS_DYNAMIC_MODULE_NAME, - PPDIFFUSERS_MODULES_CACHE, - PPNLP_BOS_RESOLVE_ENDPOINT, - TEST_DOWNLOAD_SERVER, - TEXT_ENCODER_ATTN_MODULE, - TO_DIFFUSERS, - TORCH_SAFETENSORS_WEIGHTS_NAME, - TORCH_WEIGHTS_NAME, - WEIGHTS_NAME, - get_map_location_default, -) -from .deprecation_utils import deprecate -from .doc_utils import replace_example_docstring -from .download_utils import ( - _add_variant, - _get_model_file, - bos_hf_download, - ppdiffusers_bos_dir_download, - ppdiffusers_url_download, -) -from .dynamic_modules_utils import get_class_from_dynamic_module -from .hub_utils import HF_HUB_OFFLINE, extract_commit_hash, http_user_agent -from .import_utils import ( - BACKENDS_MAPPING, - ENV_VARS_TRUE_AND_AUTO_VALUES, - ENV_VARS_TRUE_VALUES, - DummyObject, - OptionalDependencyNotAvailable, - is_bs4_available, - is_einops_available, - is_fastdeploy_available, - is_ftfy_available, - is_inflect_available, - is_k_diffusion_available, - is_k_diffusion_version, - is_librosa_available, - is_note_seq_available, - is_omegaconf_available, - is_paddle_available, - is_paddle_version, - is_paddlenlp_available, - is_paddlenlp_version, - is_ppxformers_available, - is_safetensors_available, - is_scipy_available, - is_tensorboard_available, - is_torch_available, - is_torch_version, - is_unidecode_available, - is_visualdl_available, - is_wandb_available, - requires_backends, -) - -# custom load_utils -from .load_utils import is_torch_file, safetensors_load, smart_load, torch_load -from .logging import get_logger -from .outputs import BaseOutput -from .paddle_utils import rand_tensor, randint_tensor, randn_tensor -from .pil_utils import PIL_INTERPOLATION, numpy_to_pil, pd_to_pil, pt_to_pil - -if is_paddle_available(): - from .testing_utils import ( - floats_tensor, - image_grid, - load_hf_numpy, - load_image, - load_numpy, - load_pd, - load_ppnlp_numpy, - nightly, - paddle_all_close, - paddle_device, - 
parse_flag_from_env, - print_tensor_test, - require_paddle_gpu, - slow, - ) - -if is_torch_available(): - from .testing_utils import require_torch - -logger = get_logger(__name__) - - -def apply_forward_hook(method): - return method - - -from .testing_utils import export_to_video - - -def check_min_version(min_version): - if version.parse(__version__) < version.parse(min_version): - if "dev" in min_version: - error_message = ( - "This example requires a source install from PaddleNLP ppdiffusers (see " - "`https://huggingface.co/docs/diffusers/installation#install-from-source`)," - ) - else: - error_message = f"This example requires a minimum version of {min_version}," - error_message += f" but the version found is {__version__}.\n" - raise ImportError(error_message) diff --git a/ppdiffusers/ppdiffusers/utils/constants.py b/ppdiffusers/ppdiffusers/utils/constants.py deleted file mode 100644 index 818a076646f4..000000000000 --- a/ppdiffusers/ppdiffusers/utils/constants.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os - -from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE, hf_cache_home - - -def str2bool(variable): - if isinstance(variable, bool): - return variable - - if not isinstance(variable, str): - variable = str(variable) - - if variable.lower() == "false": - return False - elif variable.lower() == "true": - return True - else: - raise ValueError("Not supported value: {}".format(variable)) - - -ppnlp_cache_home = os.path.expanduser( - os.getenv("PPNLP_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "paddlenlp")) -) - -ppdiffusers_default_cache_path = os.path.join(ppnlp_cache_home, "ppdiffusers") -# diffusers_default_cache_path = os.path.join(HUGGINGFACE_HUB_CACHE, "diffusers") -diffusers_default_cache_path = HUGGINGFACE_HUB_CACHE - -CONFIG_NAME = "config.json" -TORCH_WEIGHTS_NAME = "diffusion_pytorch_model.bin" -TORCH_SAFETENSORS_WEIGHTS_NAME = "diffusion_pytorch_model.safetensors" -FLAX_WEIGHTS_NAME = "diffusion_flax_model.msgpack" -ONNX_WEIGHTS_NAME = "model.onnx" -ONNX_EXTERNAL_WEIGHTS_NAME = "weights.pb" - -HUGGINGFACE_CO_RESOLVE_ENDPOINT = "https://huggingface.co" -PPDIFFUSERS_CACHE = ppdiffusers_default_cache_path -DIFFUSERS_CACHE = diffusers_default_cache_path -DIFFUSERS_DYNAMIC_MODULE_NAME = "diffusers_modules" -PPDIFFUSERS_DYNAMIC_MODULE_NAME = "ppdiffusers_modules" -HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(hf_cache_home, "modules")) -PPDIFFUSERS_MODULES_CACHE = os.getenv("PPDIFFUSERS_MODULES_CACHE", os.path.join(ppnlp_cache_home, "modules")) - -PADDLE_WEIGHTS_NAME = "model_state.pdparams" -FASTDEPLOY_WEIGHTS_NAME = "inference.pdiparams" -FASTDEPLOY_MODEL_NAME = "inference.pdmodel" -WEIGHTS_NAME = PADDLE_WEIGHTS_NAME - -TEST_DOWNLOAD_SERVER = "https://paddlenlp.bj.bcebos.com/models/community/ppdiffusers/tests" -DOWNLOAD_SERVER = 
"https://bj.bcebos.com/paddlenlp/models/community" -PPNLP_BOS_RESOLVE_ENDPOINT = os.getenv("PPNLP_ENDPOINT", "https://bj.bcebos.com/paddlenlp") -DEPRECATED_REVISION_ARGS = ["fp16", "non-ema"] -TEXT_ENCODER_ATTN_MODULE = ".self_attn" -LOW_CPU_MEM_USAGE_DEFAULT = str2bool(os.getenv("LOW_CPU_MEM_USAGE_DEFAULT", False)) - - -NEG_INF = -1e4 - -get_map_location_default = lambda *args, **kwargs: os.getenv("MAP_LOCATION_DEFAULT", "cpu") -FROM_HF_HUB = str2bool(os.getenv("FROM_HF_HUB", False)) -FROM_DIFFUSERS = str2bool(os.getenv("FROM_DIFFUSERS", False)) -TO_DIFFUSERS = str2bool(os.getenv("TO_DIFFUSERS", False)) - -# FOR tests -if bool(os.getenv("PATCH_ALLCLOSE", False)): - import paddle - - raw_all_close = paddle.allclose - - def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): - print(x.tolist()) - print(y.tolist()) - return raw_all_close(x, y, rtol=rtol, atol=atol, equal_nan=equal_nan, name=name) diff --git a/ppdiffusers/ppdiffusers/utils/deprecation_utils.py b/ppdiffusers/ppdiffusers/utils/deprecation_utils.py deleted file mode 100644 index 1322f1863bc4..000000000000 --- a/ppdiffusers/ppdiffusers/utils/deprecation_utils.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -import warnings -from typing import Any, Dict, Optional, Union - -from packaging import version - - -def deprecate(*args, take_from: Optional[Union[Dict, Any]] = None, standard_warn=True, stacklevel=2): - from ..version import VERSION as __version__ - - deprecated_kwargs = take_from - values = () - if not isinstance(args[0], tuple): - args = (args,) - - for attribute, version_name, message in args: - if version.parse(version.parse(__version__).base_version) >= version.parse(version_name): - raise ValueError( - f"The deprecation tuple {(attribute, version_name, message)} should be removed since ppdiffusers'" - f" version {__version__} is >= {version_name}" - ) - - warning = None - if isinstance(deprecated_kwargs, dict) and attribute in deprecated_kwargs: - values += (deprecated_kwargs.pop(attribute),) - warning = f"The `{attribute}` argument is deprecated and will be removed in version {version_name}." - elif hasattr(deprecated_kwargs, attribute): - values += (getattr(deprecated_kwargs, attribute),) - warning = f"The `{attribute}` attribute is deprecated and will be removed in version {version_name}." - elif deprecated_kwargs is None: - warning = f"`{attribute}` is deprecated and will be removed in version {version_name}." 
- - if warning is not None: - warning = warning + " " if standard_warn else "" - warnings.warn(warning + message, FutureWarning, stacklevel=stacklevel) - - if isinstance(deprecated_kwargs, dict) and len(deprecated_kwargs) > 0: - call_frame = inspect.getouterframes(inspect.currentframe())[1] - filename = call_frame.filename - line_number = call_frame.lineno - function = call_frame.function - key, value = next(iter(deprecated_kwargs.items())) - raise TypeError(f"{function} in {filename} line {line_number-1} got an unexpected keyword argument `{key}`") - - if len(values) == 0: - return - elif len(values) == 1: - return values[0] - return values diff --git a/ppdiffusers/ppdiffusers/utils/doc_utils.py b/ppdiffusers/ppdiffusers/utils/doc_utils.py deleted file mode 100644 index 01188c98e915..000000000000 --- a/ppdiffusers/ppdiffusers/utils/doc_utils.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Doc utilities: Utilities related to documentation -""" -import re - - -def replace_example_docstring(example_docstring): - def docstring_decorator(fn): - func_doc = fn.__doc__ - lines = func_doc.split("\n") - i = 0 - while i < len(lines) and re.search(r"^\s*Examples?:\s*$", lines[i]) is None: - i += 1 - if i < len(lines): - lines[i] = example_docstring - func_doc = "\n".join(lines) - else: - raise ValueError( - f"The function {fn} should have an empty 'Examples:' in its docstring as placeholder, " - f"current docstring is:\n{func_doc}" - ) - fn.__doc__ = func_doc - return fn - - return docstring_decorator diff --git a/ppdiffusers/ppdiffusers/utils/download_utils.py b/ppdiffusers/ppdiffusers/utils/download_utils.py deleted file mode 100644 index 2ef31e8ba396..000000000000 --- a/ppdiffusers/ppdiffusers/utils/download_utils.py +++ /dev/null @@ -1,661 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
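A usage sketch for the `deprecate` helper deleted above, assuming it remains importable from `ppdiffusers.utils`; the function name, keyword, and cut-off version below are illustrative only:

from ppdiffusers.utils import deprecate  # assumption: re-exported from the utils package

def apply_guidance(sample, guidance_scale=1.0, **deprecated_kwargs):
    # Accept the old `scale=` keyword for a while: deprecate() pops it from the
    # kwargs dict, emits a FutureWarning, and returns its value (None when the
    # caller already uses the new keyword).
    scale = deprecate("scale", "1.0.0", "Pass `guidance_scale` instead.", take_from=deprecated_kwargs)
    if scale is not None:
        guidance_scale = scale
    return sample * guidance_scale

print(apply_guidance(2.0, scale=7.5))  # warns once, prints 15.0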
- -import os -import os.path -import re -import tempfile -import warnings -from contextlib import contextmanager -from functools import partial -from pathlib import Path -from typing import List, Optional, Union -from urllib.parse import quote - -import requests -from filelock import FileLock -from huggingface_hub import hf_hub_download -from huggingface_hub.file_download import _chmod_and_replace, http_get -from huggingface_hub.utils import ( - EntryNotFoundError, - RepositoryNotFoundError, - RevisionNotFoundError, -) -from huggingface_hub.utils import tqdm as hf_tqdm -from packaging import version -from requests import HTTPError -from tqdm.auto import tqdm as base_tqdm -from tqdm.contrib.concurrent import thread_map - -from ..version import VERSION as __version__ -from .constants import ( - DEPRECATED_REVISION_ARGS, - HUGGINGFACE_CO_RESOLVE_ENDPOINT, - PPDIFFUSERS_CACHE, - PPNLP_BOS_RESOLVE_ENDPOINT, - TORCH_SAFETENSORS_WEIGHTS_NAME, - WEIGHTS_NAME, -) -from .logging import get_logger - - -def _add_variant(weights_name: str, variant: Optional[str] = None) -> str: - if variant is not None: - splits = weights_name.split(".") - splits = splits[:-1] + [variant] + splits[-1:] - weights_name = ".".join(splits) - - return weights_name - - -# https://github.com/huggingface/diffusers/blob/da2ce1a6b92f48cabe9e9d3944c4ee8b007b2871/src/diffusers/utils/hub_utils.py#L246 -def _get_model_file( - pretrained_model_name_or_path, - *, - weights_name, - subfolder, - cache_dir, - force_download=False, - revision=None, - proxies=None, - resume_download=False, - local_files_only=None, - use_auth_token=None, - user_agent=None, - commit_hash=None, - file_lock_timeout=-1, - from_hf_hub=False, -): - pretrained_model_name_or_path = str(pretrained_model_name_or_path) - if os.path.isfile(pretrained_model_name_or_path): - return pretrained_model_name_or_path - elif os.path.isdir(pretrained_model_name_or_path): - if os.path.isfile(os.path.join(pretrained_model_name_or_path, weights_name)): - # Load from a PyTorch checkpoint - model_file = os.path.join(pretrained_model_name_or_path, weights_name) - return model_file - elif subfolder is not None and os.path.isfile( - os.path.join(pretrained_model_name_or_path, subfolder, weights_name) - ): - model_file = os.path.join(pretrained_model_name_or_path, subfolder, weights_name) - return model_file - else: - raise EnvironmentError( - f"Error no file named {weights_name} found in directory {pretrained_model_name_or_path}." 
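The `_add_variant` helper above only splices a variant tag in front of the file extension; restating the same two lines as a standalone snippet makes the naming convention concrete:

# Reproduces the removed helper's behaviour for illustration.
def _add_variant(weights_name, variant=None):
    if variant is not None:
        splits = weights_name.split(".")
        weights_name = ".".join(splits[:-1] + [variant] + splits[-1:])
    return weights_name

print(_add_variant("model_state.pdparams", "fp16"))  # model_state.fp16.pdparams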
- ) - else: - return bos_hf_download( - pretrained_model_name_or_path, - filename=weights_name, - subfolder=subfolder, - cache_dir=cache_dir, - force_download=force_download, - revision=revision, - from_hf_hub=from_hf_hub, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - user_agent=user_agent, - file_lock_timeout=file_lock_timeout, - commit_hash=commit_hash, - ) - - -REPO_TYPES = ["model"] -DEFAULT_REVISION = "main" -# REPO_ID_SEPARATOR = "--" -REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$") -PPDIFFUSERS_BOS_URL_TEMPLATE = PPNLP_BOS_RESOLVE_ENDPOINT + "/{repo_type}/community/{repo_id}/{revision}/{filename}" - -ALLOW_PATTERNS_MAPPING = { - "scheduler": [ - "scheduler_config.json", - ], - "text_encoder": [ - "model_state.pdparams", - "config.json", - "model_config.json", - ], - "safety_checker": [ - "model_state.pdparams", - "config.json", - "model_config.json", - ], - "unet": [ - "model_state.pdparams", - "config.json", - ], - "vae": [ - "model_state.pdparams", - "config.json", - ], - "vqvae": [ - "model_state.pdparams", - "config.json", - ], - "bert": [ - "model_state.pdparams", - "config.json", - "model_config.json", - ], - "tokenizer": [ - "tokenizer_config.json", - "vocab.json", - "added_tokens.json", - "vocab.txt", - "special_tokens_map.json", - "spiece.model", - "merges.txt", - "sentencepiece.bpe.model", - ], - "feature_extractor": ["preprocessor_config.json"], - "transformer": [ - "model_state.pdparams", - "config.json", - ], - "mel": ["mel_config.json"], - "melgan": ["model.onnx"], - "others": [ - # models - "model_state.pdparams", - "model_config.json", - "config.json", - # scheduler - "scheduler_config.json", - # feature_extractor - "preprocessor_config.json", - # onnx - "model.onnx", - "pipeline.py", - # tokenizer - "tokenizer_config.json", - "vocab.json", - "added_tokens.json", - "vocab.txt", - "special_tokens_map.json", - "spiece.model", - "merges.txt", - "sentencepiece.bpe.model", - ], -} - -logger = get_logger(__name__) - - -def ppdiffusers_bos_url( - repo_id: str, - filename: str, - *, - subfolder: Optional[str] = None, - repo_type: Optional[str] = None, - revision: Optional[str] = None, -) -> str: - if subfolder == "": - subfolder = None - if subfolder is not None: - filename = f"{subfolder}/{filename}" - - if repo_type is None: - repo_type = REPO_TYPES[0] - if repo_type not in REPO_TYPES: - raise ValueError("Invalid repo type") - if repo_type == "model": - repo_type = "models" - if revision is None: - revision = DEFAULT_REVISION - return PPDIFFUSERS_BOS_URL_TEMPLATE.format( - repo_type=repo_type, - repo_id=repo_id, - revision=quote(revision, safe=""), - filename=quote(filename), - ).replace(f"/{DEFAULT_REVISION}/", "/") - - -def repo_folder_name(*, repo_id: str, repo_type: str) -> str: - # """Return a serialized version of a hf.co repo name and type, safe for disk storage - # as a single non-nested folder. 
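To make the BOS URL scheme concrete, the template used by `ppdiffusers_bos_url` above expands as follows (the repo id and weight file are hypothetical examples):

from urllib.parse import quote

endpoint = "https://bj.bcebos.com/paddlenlp"  # default PPNLP_BOS_RESOLVE_ENDPOINT
template = endpoint + "/{repo_type}/community/{repo_id}/{revision}/{filename}"
url = template.format(
    repo_type="models",
    repo_id="runwayml/stable-diffusion-v1-5",      # hypothetical repo id
    revision=quote("main", safe=""),
    filename=quote("unet/model_state.pdparams"),   # subfolder is joined into the filename
).replace("/main/", "/")                           # the default revision is dropped from the path
print(url)
# https://bj.bcebos.com/paddlenlp/models/community/runwayml/stable-diffusion-v1-5/unet/model_state.pdparams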
- # Example: models--julien-c--EsperBERTo-small - # """ - # remove all `/` occurrences to correctly convert repo to directory name - # parts = ["ppdiffusers", f"{repo_type}s", *repo_id.split("/")] - # return REPO_ID_SEPARATOR.join(parts) - return repo_id - - -def ppdiffusers_bos_download( - repo_id: str, - filename: str, - *, - subfolder: Optional[str] = None, - repo_type: Optional[str] = None, - revision: Optional[str] = None, - cache_dir: Union[str, Path, None] = None, - force_download: bool = False, - resume_download: bool = False, - file_lock_timeout: int = -1, -): - if cache_dir is None: - cache_dir = PPDIFFUSERS_CACHE - if revision is None: - revision = DEFAULT_REVISION - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - if subfolder == "": - subfolder = None - if subfolder is not None: - # This is used to create a URL, and not a local path, hence the forward slash. - filename = f"{subfolder}/{filename}" - - if repo_type is None: - repo_type = REPO_TYPES[0] - - if repo_type not in REPO_TYPES: - raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are:" f" {str(REPO_TYPES)}") - storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type)) - os.makedirs(storage_folder, exist_ok=True) - - # cross platform transcription of filename, to be used as a local file path. - relative_filename = os.path.join(*filename.split("/")) - - if REGEX_COMMIT_HASH.match(revision): - pointer_path = os.path.join(storage_folder, revision, relative_filename) - else: - pointer_path = os.path.join(storage_folder, relative_filename) - - if os.path.exists(pointer_path) and not force_download: - return pointer_path - - url_to_download = ppdiffusers_bos_url(repo_id, filename, repo_type=repo_type, revision=revision) - - blob_path = os.path.join(storage_folder, filename) - # Prevent parallel downloads of the same file with a lock. - lock_path = blob_path + ".lock" - - # Some Windows versions do not allow for paths longer than 255 characters. - # In this case, we must specify it is an extended path by using the "\\?\" prefix. - if os.name == "nt" and len(os.path.abspath(lock_path)) > 255: - lock_path = "\\\\?\\" + os.path.abspath(lock_path) - - if os.name == "nt" and len(os.path.abspath(blob_path)) > 255: - blob_path = "\\\\?\\" + os.path.abspath(blob_path) - - os.makedirs(os.path.dirname(lock_path), exist_ok=True) - with FileLock(lock_path, timeout=file_lock_timeout): - # If the download just completed while the lock was activated. - if os.path.exists(pointer_path) and not force_download: - # Even if returning early like here, the lock will be released. - return pointer_path - - if resume_download: - incomplete_path = blob_path + ".incomplete" - - @contextmanager - def _resumable_file_manager(): - with open(incomplete_path, "ab") as f: - yield f - - temp_file_manager = _resumable_file_manager - if os.path.exists(incomplete_path): - resume_size = os.stat(incomplete_path).st_size - else: - resume_size = 0 - else: - temp_file_manager = partial( # type: ignore - tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False - ) - resume_size = 0 - - # Download to temporary file, then copy to cache dir once finished. - # Otherwise you get corrupt cache entries if the download gets interrupted. 
- with temp_file_manager() as temp_file: - logger.info("downloading %s to %s", url_to_download, temp_file.name) - - http_get( - url_to_download, - temp_file, - proxies=None, - resume_size=resume_size, - headers=None, - ) - - logger.info("storing %s in cache at %s", url_to_download, blob_path) - _chmod_and_replace(temp_file.name, blob_path) - try: - os.remove(lock_path) - except OSError: - pass - - return pointer_path - - -def ppdiffusers_url_download( - url_to_download: str, - cache_dir: Union[str, Path, None] = None, - filename: Optional[str] = None, - force_download: bool = False, - resume_download: bool = False, - file_lock_timeout: int = -1, -): - if cache_dir is None: - cache_dir = PPDIFFUSERS_CACHE - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - if filename is None: - filename = url_to_download.split("/")[-1] - file_path = os.path.join(cache_dir, filename) - # Prevent parallel downloads of the same file with a lock. - lock_path = file_path + ".lock" - # Some Windows versions do not allow for paths longer than 255 characters. - # In this case, we must specify it is an extended path by using the "\\?\" prefix. - if os.name == "nt" and len(os.path.abspath(lock_path)) > 255: - lock_path = "\\\\?\\" + os.path.abspath(lock_path) - - if os.name == "nt" and len(os.path.abspath(file_path)) > 255: - file_path = "\\\\?\\" + os.path.abspath(file_path) - - os.makedirs(os.path.dirname(lock_path), exist_ok=True) - with FileLock(lock_path, timeout=file_lock_timeout): - # If the download just completed while the lock was activated. - if os.path.exists(file_path) and not force_download: - # Even if returning early like here, the lock will be released. - return file_path - - if resume_download: - incomplete_path = file_path + ".incomplete" - - @contextmanager - def _resumable_file_manager(): - with open(incomplete_path, "ab") as f: - yield f - - temp_file_manager = _resumable_file_manager - if os.path.exists(incomplete_path): - resume_size = os.stat(incomplete_path).st_size - else: - resume_size = 0 - else: - temp_file_manager = partial( # type: ignore - tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False - ) - resume_size = 0 - - # Download to temporary file, then copy to cache dir once finished. - # Otherwise you get corrupt cache entries if the download gets interrupted. - with temp_file_manager() as temp_file: - logger.info("downloading %s to %s", url_to_download, temp_file.name) - - http_get( - url_to_download, - temp_file, - proxies=None, - resume_size=resume_size, - headers=None, - ) - - logger.info("storing %s in cache at %s", url_to_download, file_path) - _chmod_and_replace(temp_file.name, file_path) - try: - os.remove(lock_path) - except OSError: - pass - return file_path - - -def bos_hf_download( - pretrained_model_name_or_path, - *, - filename, - subfolder, - cache_dir, - force_download=False, - revision=None, - from_hf_hub=False, - proxies=None, - resume_download=False, - local_files_only=None, - use_auth_token=None, - user_agent=None, - file_lock_timeout=-1, - commit_hash=None, -): - if from_hf_hub: - # 1. 
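Both download helpers above share one pattern: serialize concurrent downloads with a `filelock.FileLock`, append into a `*.incomplete` file when resuming, and only publish the finished file at the end. A condensed sketch of that pattern, with a plain `requests` fetch standing in for `huggingface_hub`'s `http_get` (paths and URL are illustrative):

import os
import requests
from filelock import FileLock

def fetch_with_resume(url, dest, timeout=-1):
    lock = FileLock(dest + ".lock", timeout=timeout)   # one downloader per target path
    with lock:
        if os.path.exists(dest):                       # someone else finished while we waited
            return dest
        part = dest + ".incomplete"
        resume_from = os.path.getsize(part) if os.path.exists(part) else 0
        headers = {"Range": f"bytes={resume_from}-"} if resume_from else {}
        with requests.get(url, stream=True, headers=headers) as r, open(part, "ab") as f:
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)
        os.replace(part, dest)                         # atomic publish, mirrors _chmod_and_replace
    return dest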
First check if deprecated way of loading from branches is used - if ( - revision in DEPRECATED_REVISION_ARGS - and (filename == WEIGHTS_NAME or filename == TORCH_SAFETENSORS_WEIGHTS_NAME) - and version.parse(version.parse(__version__).base_version) >= version.parse("0.17.0") - ): - try: - model_file = hf_hub_download( - pretrained_model_name_or_path, - filename=_add_variant(filename, revision), - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - user_agent=user_agent, - subfolder=subfolder, - revision=revision or commit_hash, - ) - warnings.warn( - f"Loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'` is deprecated. Loading instead from `revision='main'` with `variant={revision}`. Loading model variants via `revision='{revision}'` will be removed in diffusers v1. Please use `variant='{revision}'` instead.", - FutureWarning, - ) - return model_file - except: # noqa: E722 - warnings.warn( - f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'`. This behavior is deprecated and will be removed in diffusers v1. One should use `variant='{revision}'` instead. However, it appears that {pretrained_model_name_or_path} currently does not have a {_add_variant(filename, revision)} file in the 'main' branch of {pretrained_model_name_or_path}. \n The Diffusers team and community would be very grateful if you could open an issue: https://github.com/huggingface/diffusers/issues/new with the title '{pretrained_model_name_or_path} is missing {_add_variant(filename, revision)}' so that the correct variant file can be added.", - FutureWarning, - ) - # 2. Load model file as usual - try: - model_file = hf_hub_download( - pretrained_model_name_or_path, - filename=filename, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - user_agent=user_agent, - subfolder=subfolder, - revision=revision, - ) - return model_file - - except RepositoryNotFoundError: - raise EnvironmentError( - f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier " - "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a " - "token having permission to this repo with `use_auth_token` or log in with `huggingface-cli " - "login`." - ) - except RevisionNotFoundError: - raise EnvironmentError( - f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for " - "this model name. Check the model page at " - f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions." 
- ) - except EntryNotFoundError: - raise EnvironmentError(f"{pretrained_model_name_or_path} does not appear to have a file named {filename}.") - except HTTPError as err: - raise EnvironmentError( - f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n{err}" - ) - except ValueError: - raise EnvironmentError( - f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it" - f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a" - f" directory containing a file named {filename} or" - " \nCheckout your internet connection or see how to run the library in" - " offline mode at 'https://huggingface.co/docs/diffusers/installation#offline-mode'." - ) - except EnvironmentError: - raise EnvironmentError( - f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from " - "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " - f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " - f"containing a file named {filename}" - ) - except KeyboardInterrupt: - raise EnvironmentError( - "You have interrupted the download, if you want to continue the download, you can set `resume_download=True`!" - ) - else: - try: - model_file = ppdiffusers_bos_download( - pretrained_model_name_or_path, - filename=filename, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - subfolder=subfolder, - revision=revision, - file_lock_timeout=file_lock_timeout, - ) - return model_file - except HTTPError as err: - raise EnvironmentError( - f"{err}!\n" - f"There was a specific connection error when trying to load '{pretrained_model_name_or_path}'! " - f"We couldn't connect to '{PPNLP_BOS_RESOLVE_ENDPOINT}' to load this model, couldn't find it " - f"in the cached files and it looks like '{pretrained_model_name_or_path}' is not the path to a " - f"directory containing a file named '{filename}'." - ) - except EnvironmentError: - raise EnvironmentError( - f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from " - f"'{PPNLP_BOS_RESOLVE_ENDPOINT}', make sure you don't have a local directory with the same name. " - f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " - f"containing a file named '{filename}'" - ) - except KeyboardInterrupt: - raise EnvironmentError( - "You have interrupted the download, if you want to continue the download, you can set `resume_download=True`!" 
- ) - - -def url_file_exists(url: str) -> bool: - """check whether the url file exists - - refer to: https://stackoverflow.com/questions/2486145/python-check-if-url-to-jpg-exists - - Args: - url (str): the url of target file - - Returns: - bool: whether the url file exists - """ - is_url = url.startswith("http://") or url.startswith("https://") - if not is_url: - return False - - result = requests.head(url) - return result.status_code == requests.codes.ok - - -def ppdiffusers_bos_dir_download( - repo_id: str, - *, - revision: Optional[str] = None, - repo_type: Optional[str] = None, - cache_dir: Union[str, Path, None] = None, - force_download: bool = False, - resume_download: bool = False, - folder_names: Optional[Union[List[str], str]] = None, - max_workers: int = 1, - tqdm_class: Optional[base_tqdm] = None, - variant: Optional[str] = None, - is_fastdeploy_model: Optional[str] = False, - file_lock_timeout: int = -1, - local_files_only: bool = False, -) -> str: - # update repo id must end with @fastdeploy - if is_fastdeploy_model and not repo_id.endswith("@fastdeploy"): - repo_id = f"{repo_id}@fastdeploy" - - if local_files_only: - return os.path.join(cache_dir, repo_id) - - filtered_repo_files = [["model_index.json", None]] - for subfolder in folder_names: - allow_patterns = ALLOW_PATTERNS_MAPPING.get(subfolder, ALLOW_PATTERNS_MAPPING["others"]) - if is_fastdeploy_model: - allow_patterns = [ap for ap in allow_patterns if "pdparams" not in ap] - allow_patterns.extend(["inference.pdiparams", "inference.pdmodel"]) - for filename in allow_patterns: - need_to_check_no_variant_file = False - raw_filename = filename - if "pdparams" in filename: - filename = _add_variant(filename, variant) - need_to_check_no_variant_file = variant is not None - - url = ppdiffusers_bos_url( - repo_id, - filename=filename, - subfolder=subfolder, - ) - if url_file_exists(url): - # exist file - filtered_repo_files.append( - [ - filename, - subfolder, - ] - ) - else: - if need_to_check_no_variant_file: - url = ppdiffusers_bos_url( - repo_id, - filename=raw_filename, - subfolder=subfolder, - ) - if url_file_exists(url): - # exist file - filtered_repo_files.append( - [ - raw_filename, - subfolder, - ] - ) - - def _inner_ppdiffusers_bos_download(repo_file_list): - filename, _subfolder = repo_file_list - return ppdiffusers_bos_download( - repo_id, - filename=filename, - subfolder=_subfolder, - repo_type=repo_type, - cache_dir=cache_dir, - revision=revision, - resume_download=resume_download, - force_download=force_download, - file_lock_timeout=file_lock_timeout, - ) - - thread_map( - _inner_ppdiffusers_bos_download, - filtered_repo_files, - desc=f"Fetching {len(filtered_repo_files)} files", - max_workers=max_workers, - # User can use its own tqdm class or the default one from `huggingface_hub.utils` - tqdm_class=tqdm_class or hf_tqdm, - ) - return os.path.join(cache_dir, repo_id) diff --git a/ppdiffusers/ppdiffusers/utils/dummy_fastdeploy_objects.py b/ppdiffusers/ppdiffusers/utils/dummy_fastdeploy_objects.py deleted file mode 100644 index d25a79a53e2e..000000000000 --- a/ppdiffusers/ppdiffusers/utils/dummy_fastdeploy_objects.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This file is autogenerated by the command `make fix-copies`, do not edit. -from . import DummyObject, requires_backends - - -class FastDeployRuntimeModel(metaclass=DummyObject): - _backends = ["fastdeploy"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["fastdeploy"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["fastdeploy"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["fastdeploy"]) diff --git a/ppdiffusers/ppdiffusers/utils/dummy_note_seq_objects.py b/ppdiffusers/ppdiffusers/utils/dummy_note_seq_objects.py deleted file mode 100644 index 8e7945eae036..000000000000 --- a/ppdiffusers/ppdiffusers/utils/dummy_note_seq_objects.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This file is autogenerated by the command `make fix-copies`, do not edit. -from ..utils import DummyObject, requires_backends - - -class MidiProcessor(metaclass=DummyObject): - _backends = ["note_seq"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["note_seq"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["note_seq"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["note_seq"]) diff --git a/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_einops_objects.py b/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_einops_objects.py deleted file mode 100644 index 74ac513030ab..000000000000 --- a/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_einops_objects.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This file is autogenerated by the command `make fix-copies`, do not edit. -from . 
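Each of the dummy_*_objects.py files deleted in this patch follows the same placeholder pattern: the class exists so imports always succeed, but every entry point calls `requires_backends`, which raises an ImportError naming the missing optional dependency. A simplified, self-contained re-creation of that mechanism (the real `DummyObject` and `requires_backends` live in ppdiffusers' import utilities and additionally check whether the backend is actually installed):

# Hypothetical, simplified stand-ins for the helpers used by the deleted files.
def requires_backends(obj, backends):
    # The real helper only raises when the listed backends are missing;
    # here we always raise to show the failure mode.
    name = obj.__name__ if isinstance(obj, type) else type(obj).__name__
    raise ImportError(f"{name} requires the optional backends: {backends}")

class DummyObject(type):
    """Simplified stand-in for the metaclass used by the deleted files."""

class FastDeployRuntimeModel(metaclass=DummyObject):
    _backends = ["fastdeploy"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["fastdeploy"])

try:
    FastDeployRuntimeModel()
except ImportError as err:
    print(err)  # FastDeployRuntimeModel requires the optional backends: ['fastdeploy']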
import DummyObject, requires_backends - - -class UViTModel(metaclass=DummyObject): - _backends = ["paddle", "einops"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "einops"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "einops"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "einops"]) diff --git a/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_librosa_objects.py b/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_librosa_objects.py deleted file mode 100644 index c31de0af87d5..000000000000 --- a/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_librosa_objects.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This file is autogenerated by the command `make fix-copies`, do not edit. -from . import DummyObject, requires_backends - - -class AudioDiffusionPipeline(metaclass=DummyObject): - _backends = ["paddle", "librosa"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "librosa"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "librosa"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "librosa"]) - - -class Mel(metaclass=DummyObject): - _backends = ["paddle", "librosa"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "librosa"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "librosa"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "librosa"]) diff --git a/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_and_einops_objects.py b/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_and_einops_objects.py deleted file mode 100644 index 9a58f2018ec3..000000000000 --- a/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_and_einops_objects.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This file is autogenerated by the command `make fix-copies`, do not edit. -from . 
import DummyObject, requires_backends - - -class UniDiffuserPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp", "einops"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp", "einops"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp", "einops"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp", "einops"]) diff --git a/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_and_fastdeploy_objects.py b/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_and_fastdeploy_objects.py deleted file mode 100644 index c3356583dd1f..000000000000 --- a/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_and_fastdeploy_objects.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This file is autogenerated by the command `make fix-copies`, do not edit. -# flake8: noqa - -from . import DummyObject, requires_backends - - -class FastDeployStableDiffusionImg2ImgPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp", "fastdeploy"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp", "fastdeploy"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp", "fastdeploy"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp", "fastdeploy"]) - - -class FastDeployStableDiffusionInpaintPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp", "fastdeploy"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp", "fastdeploy"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp", "fastdeploy"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp", "fastdeploy"]) - - -class FastDeployStableDiffusionInpaintPipelineLegacy(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp", "fastdeploy"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp", "fastdeploy"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp", "fastdeploy"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp", "fastdeploy"]) - - -class FastDeployStableDiffusionMegaPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp", "fastdeploy"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp", "fastdeploy"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp", "fastdeploy"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - 
requires_backends(cls, ["paddle", "paddlenlp", "fastdeploy"]) - - -class FastDeployStableDiffusionPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp", "fastdeploy"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp", "fastdeploy"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp", "fastdeploy"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp", "fastdeploy"]) - - -class FastDeployCycleDiffusionPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp", "fastdeploy"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp", "fastdeploy"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp", "fastdeploy"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp", "fastdeploy"]) - - -class FastDeployStableDiffusionControlNetPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp", "fastdeploy"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp", "fastdeploy"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp", "fastdeploy"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp", "fastdeploy"]) diff --git a/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_and_k_diffusion_objects.py b/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_and_k_diffusion_objects.py deleted file mode 100644 index f84a2855b7f7..000000000000 --- a/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_and_k_diffusion_objects.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This file is autogenerated by the command `make fix-copies`, do not edit. -from . import DummyObject, requires_backends - - -class StableDiffusionKDiffusionPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp", "k_diffusion"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp", "k_diffusion"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp", "k_diffusion"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp", "k_diffusion"]) diff --git a/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_and_note_seq_objects.py b/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_and_note_seq_objects.py deleted file mode 100644 index 75d0f3217b40..000000000000 --- a/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_and_note_seq_objects.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This file is autogenerated by the command `make fix-copies`, do not edit. -from ..utils import DummyObject, requires_backends - - -class SpectrogramDiffusionPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp", "note_seq"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp", "note_seq"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp", "note_seq"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp", "note_seq"]) diff --git a/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_objects.py b/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_objects.py deleted file mode 100644 index 3d755460a7cf..000000000000 --- a/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_paddlenlp_objects.py +++ /dev/null @@ -1,692 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This file is autogenerated by the command `make fix-copies`, do not edit. -from . 
import DummyObject, requires_backends - - -class AltDiffusionImg2ImgPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class AltDiffusionPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class AudioLDMPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class CycleDiffusionPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class IFImg2ImgPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class IFImg2ImgSuperResolutionPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class IFInpaintingPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class IFInpaintingSuperResolutionPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class IFPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - 
requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class IFSuperResolutionPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class LDMTextToImagePipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class PaintByExamplePipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableDiffusionAdapterPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class SemanticStableDiffusionPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableDiffusionAttendAndExcitePipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableDiffusionControlNetPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableDiffusionDepth2ImgPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableDiffusionImageVariationPipeline(metaclass=DummyObject): 
- _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableDiffusionImg2ImgPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableDiffusionInpaintPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableDiffusionInpaintPipelineLegacy(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableDiffusionInstructPix2PixPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableDiffusionLatentUpscalePipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableDiffusionModelEditingPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableDiffusionPanoramaPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableDiffusionPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - 
requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableDiffusionMegaPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableDiffusionPipelineAllinOne(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableDiffusionPipelineSafe(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableDiffusionPix2PixZeroPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableDiffusionSAGPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableDiffusionUpscalePipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableUnCLIPImg2ImgPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class StableUnCLIPPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class UnCLIPImageVariationPipeline(metaclass=DummyObject): - _backends = 
["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class UnCLIPPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class VersatileDiffusionDualGuidedPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class VersatileDiffusionImageVariationPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class VersatileDiffusionPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class VersatileDiffusionTextToImagePipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class VQDiffusionPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class TextToVideoSDPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class TextToVideoZeroPipeline(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def 
from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class LDMBertModel(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - -class CaptionDecoder(metaclass=DummyObject): - _backends = ["paddle", "paddlenlp"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "paddlenlp"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "paddlenlp"]) diff --git a/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_scipy_objects.py b/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_scipy_objects.py deleted file mode 100644 index 1e3bf8caf75c..000000000000 --- a/ppdiffusers/ppdiffusers/utils/dummy_paddle_and_scipy_objects.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This file is autogenerated by the command `make fix-copies`, do not edit. -from . import DummyObject, requires_backends - - -class LMSDiscreteScheduler(metaclass=DummyObject): - _backends = ["paddle", "scipy"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "scipy"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "scipy"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "scipy"]) - - -class PreconfigLMSDiscreteScheduler(metaclass=DummyObject): - _backends = ["paddle", "scipy"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle", "scipy"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "scipy"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle", "scipy"]) diff --git a/ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py b/ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py deleted file mode 100644 index fcbc659ea253..000000000000 --- a/ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py +++ /dev/null @@ -1,780 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This file is autogenerated by the command `make fix-copies`, do not edit. -from . import DummyObject, requires_backends - - -class AutoencoderKL(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class ControlNetModel(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class ModelMixin(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class MultiAdapter(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class PriorTransformer(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class T5FilmDecoder(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class T2IAdapter(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class Transformer2DModel(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class UNet1DModel(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, 
["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class UNet2DConditionModel(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class UNet2DModel(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class UNet3DConditionModel(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class VQModel(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -def get_constant_schedule(*args, **kwargs): - requires_backends(get_constant_schedule, ["paddle"]) - - -def get_constant_schedule_with_warmup(*args, **kwargs): - requires_backends(get_constant_schedule_with_warmup, ["paddle"]) - - -def get_cosine_schedule_with_warmup(*args, **kwargs): - requires_backends(get_cosine_schedule_with_warmup, ["paddle"]) - - -def get_cosine_with_hard_restarts_schedule_with_warmup(*args, **kwargs): - requires_backends(get_cosine_with_hard_restarts_schedule_with_warmup, ["paddle"]) - - -def get_linear_schedule_with_warmup(*args, **kwargs): - requires_backends(get_linear_schedule_with_warmup, ["paddle"]) - - -def get_polynomial_decay_schedule_with_warmup(*args, **kwargs): - requires_backends(get_polynomial_decay_schedule_with_warmup, ["paddle"]) - - -def get_scheduler(*args, **kwargs): - requires_backends(get_scheduler, ["paddle"]) - - -class AudioPipelineOutput(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class DanceDiffusionPipeline(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class DDIMPipeline(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class DDPMPipeline(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - 
requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class DiffusionPipeline(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class DiTPipeline(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class ImagePipelineOutput(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class KarrasVePipeline(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class LDMPipeline(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class LDMSuperResolutionPipeline(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class PNDMPipeline(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class RePaintPipeline(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class ScoreSdeVePipeline(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class DDIMInverseScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - 
@classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class DDIMScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class DDPMScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class DEISMultistepScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class DPMSolverMultistepScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class DPMSolverSinglestepScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class EulerAncestralDiscreteScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class PreconfigEulerAncestralDiscreteScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class EulerDiscreteScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class HeunDiscreteScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class IPNDMScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, 
["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class KarrasVeScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class KDPM2AncestralDiscreteScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class KDPM2DiscreteScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class PNDMScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class RePaintScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class SchedulerMixin(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class ScoreSdeVeScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class UnCLIPScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class UniPCMultistepScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class VQDiffusionScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - 
@classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class DPMSolverUniDiffuserScheduler(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - -class EMAModel(metaclass=DummyObject): - _backends = ["paddle"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["paddle"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["paddle"]) diff --git a/ppdiffusers/ppdiffusers/utils/dynamic_modules_utils.py b/ppdiffusers/ppdiffusers/utils/dynamic_modules_utils.py deleted file mode 100644 index 3d30ffc3ac59..000000000000 --- a/ppdiffusers/ppdiffusers/utils/dynamic_modules_utils.py +++ /dev/null @@ -1,445 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Utilities to dynamically load objects from the Hub.""" - -import importlib -import inspect -import json -import os -import re -import shutil -import sys -from distutils.version import StrictVersion -from pathlib import Path -from typing import Dict, Optional, Union -from urllib import request - -from huggingface_hub import HfFolder, cached_download, hf_hub_download, model_info - -from . import PPDIFFUSERS_DYNAMIC_MODULE_NAME, PPDIFFUSERS_MODULES_CACHE, logging - -COMMUNITY_PIPELINES_URL = ( - "https://raw.githubusercontent.com/PaddlePaddle/PaddleNLP/{revision}/ppdiffusers/examples/community/{pipeline}.py" -) -GITEE_COMMUNITY_PIPELINES_URL = ( - "https://gitee.com/paddlepaddle/PaddleNLP/raw/{revision}/ppdiffusers/examples/community/{pipeline}.py" -) - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -def get_ppdiffusers_versions(): - url = "https://pypi.org/pypi/ppdiffusers/json" - releases = json.loads(request.urlopen(url).read())["releases"].keys() - ignore = ["0.6.0.dev1"] - releases = [r for r in releases if r not in ignore] - return sorted(releases, key=StrictVersion) - - -def init_ppdiffusers_modules(): - """ - Creates the cache directory for modules with an init, and adds it to the Python path. - """ - # This function has already been executed if PPDIFFUSERS_MODULES_CACHE already is in the Python path. 
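The dummy_*_objects.py files removed in the hunks above all repeat a single pattern: placeholder classes built on a `DummyObject` metaclass whose every entry point calls `requires_backends`, so that importing or instantiating a pipeline without its optional backend fails with an installation hint rather than an opaque `AttributeError`. A minimal, self-contained sketch of that pattern, with a stubbed availability check standing in for the real `is_paddle_available()`-style probes; this is an illustration, not the exact generated ppdiffusers code:

```python
def requires_backends(obj, backends):
    # Hypothetical stub: the real helper consults is_paddle_available() and friends.
    available = {"paddle": False, "paddlenlp": False}
    name = getattr(obj, "__name__", obj.__class__.__name__)
    missing = [b for b in backends if not available.get(b, False)]
    if missing:
        raise ImportError(f"{name} requires the following backends: {', '.join(missing)}")


class DummyObject(type):
    """Metaclass that routes class-level attribute access to requires_backends,
    so touching a class whose backend is missing raises a helpful ImportError."""

    def __getattr__(cls, name):
        if name.startswith("_"):
            raise AttributeError(name)
        requires_backends(cls, cls._backends)


class StableDiffusionPipeline(metaclass=DummyObject):
    _backends = ["paddle", "paddlenlp"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, self._backends)

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, cls._backends)
```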
- if PPDIFFUSERS_MODULES_CACHE in sys.path: - return - - sys.path.append(PPDIFFUSERS_MODULES_CACHE) - os.makedirs(PPDIFFUSERS_MODULES_CACHE, exist_ok=True) - init_path = Path(PPDIFFUSERS_MODULES_CACHE) / "__init__.py" - if not init_path.exists(): - init_path.touch() - - -def create_dynamic_module(name: Union[str, os.PathLike]): - """ - Creates a dynamic module in the cache directory for modules. - """ - init_ppdiffusers_modules() - dynamic_module_path = Path(PPDIFFUSERS_MODULES_CACHE) / name - # If the parent module does not exist yet, recursively create it. - if not dynamic_module_path.parent.exists(): - create_dynamic_module(dynamic_module_path.parent) - os.makedirs(dynamic_module_path, exist_ok=True) - init_path = dynamic_module_path / "__init__.py" - if not init_path.exists(): - init_path.touch() - - -def get_relative_imports(module_file): - """ - Get the list of modules that are relatively imported in a module file. - - Args: - module_file (`str` or `os.PathLike`): The module file to inspect. - """ - with open(module_file, "r", encoding="utf-8") as f: - content = f.read() - - # Imports of the form `import .xxx` - relative_imports = re.findall("^\s*import\s+\.(\S+)\s*$", content, flags=re.MULTILINE) - # Imports of the form `from .xxx import yyy` - relative_imports += re.findall("^\s*from\s+\.(\S+)\s+import", content, flags=re.MULTILINE) - # Unique-ify - return list(set(relative_imports)) - - -def get_relative_import_files(module_file): - """ - Get the list of all files that are needed for a given module. Note that this function recurses through the relative - imports (if a imports b and b imports c, it will return module files for b and c). - - Args: - module_file (`str` or `os.PathLike`): The module file to inspect. - """ - no_change = False - files_to_check = [module_file] - all_relative_imports = [] - - # Let's recurse through all relative imports - while not no_change: - new_imports = [] - for f in files_to_check: - new_imports.extend(get_relative_imports(f)) - - module_path = Path(module_file).parent - new_import_files = [str(module_path / m) for m in new_imports] - new_import_files = [f for f in new_import_files if f not in all_relative_imports] - files_to_check = [f"{f}.py" for f in new_import_files] - - no_change = len(new_import_files) == 0 - all_relative_imports.extend(files_to_check) - - return all_relative_imports - - -def check_imports(filename): - """ - Check if the current Python environment contains all the libraries that are imported in a file. - """ - with open(filename, "r", encoding="utf-8") as f: - content = f.read() - - # Imports of the form `import xxx` - imports = re.findall("^\s*import\s+(\S+)\s*$", content, flags=re.MULTILINE) - # Imports of the form `from xxx import yyy` - imports += re.findall("^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE) - # Only keep the top-level module - imports = [imp.split(".")[0] for imp in imports if not imp.startswith(".")] - - # Unique-ify and test we got them all - imports = list(set(imports)) - missing_packages = [] - for imp in imports: - try: - importlib.import_module(imp) - except ImportError: - missing_packages.append(imp) - - if len(missing_packages) > 0: - raise ImportError( - "This modeling file requires the following packages that were not found in your environment: " - f"{', '.join(missing_packages)}. 
Run `pip install {' '.join(missing_packages)}`" - ) - - return get_relative_imports(filename) - - -def get_class_in_module(class_name, module_path): - """ - Import a module on the cache directory for modules and extract a class from it. - """ - module_path = module_path.replace(os.path.sep, ".") - module = importlib.import_module(module_path) - - if class_name is None: - return find_pipeline_class(module) - return getattr(module, class_name) - - -def find_pipeline_class(loaded_module): - """ - Retrieve pipeline class that inherits from `DiffusionPipeline`. Note that there has to be exactly one class - inheriting from `DiffusionPipeline`. - """ - from ..pipelines import DiffusionPipeline - - cls_members = dict(inspect.getmembers(loaded_module, inspect.isclass)) - - pipeline_class = None - for cls_name, cls in cls_members.items(): - if ( - cls_name != DiffusionPipeline.__name__ - and issubclass(cls, DiffusionPipeline) - and cls.__module__.split(".")[0] != "ppdiffusers" - ): - if pipeline_class is not None: - raise ValueError( - f"Multiple classes that inherit from {DiffusionPipeline.__name__} have been found:" - f" {pipeline_class.__name__}, and {cls_name}. Please make sure to define only one in" - f" {loaded_module}." - ) - pipeline_class = cls - - return pipeline_class - - -def get_cached_module_file( - pretrained_model_name_or_path: Union[str, os.PathLike], - module_file: str, - cache_dir: Optional[Union[str, os.PathLike]] = None, - force_download: bool = False, - resume_download: bool = False, - proxies: Optional[Dict[str, str]] = None, - use_auth_token: Optional[Union[bool, str]] = None, - revision: Optional[str] = None, - local_files_only: bool = False, -): - """ - Prepares Downloads a module from a local folder or a distant repo and returns its path inside the cached - Transformers module. - - Args: - pretrained_model_name_or_path (`str` or `os.PathLike`): - This can be either: - - - a string, the *model id* of a pretrained model configuration hosted inside a model repo on - huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced - under a user or organization name, like `dbmdz/bert-base-german-cased`. - - a path to a *directory* containing a configuration file saved using the - [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. - - module_file (`str`): - The name of the module file containing the class to look for. - cache_dir (`str` or `os.PathLike`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the standard - cache should not be used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force to (re-)download the configuration files and override the cached versions if they - exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. - use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). - revision (`str`, *optional*, defaults to `"develop"`): - The specific model version to use. 
It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. - local_files_only (`bool`, *optional*, defaults to `False`): - If `True`, will only try to load the tokenizer configuration from local files. - - - - You may pass a token in `use_auth_token` if you are not logged in (`huggingface-cli long`) and want to use private - or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models). - - - - Returns: - `str`: The path to the module inside the cache. - """ - # Download and cache module_file from the repo `pretrained_model_name_or_path` of grab it if it's a local file. - pretrained_model_name_or_path = str(pretrained_model_name_or_path) - - module_file_or_url = os.path.join(pretrained_model_name_or_path, module_file) - - if os.path.isfile(module_file_or_url): - resolved_module_file = module_file_or_url - submodule = "local" - elif pretrained_model_name_or_path.count("/") == 0: - # retrieve github version that matches - if revision is None: - revision = "develop" - logger.info(f"Defaulting to main: {revision}.") - - # community pipeline on GitHub - github_url = COMMUNITY_PIPELINES_URL.format(revision=revision, pipeline=pretrained_model_name_or_path) - try: - resolved_module_file = cached_download( - github_url, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - use_auth_token=False, - ) - submodule = "git" - module_file = pretrained_model_name_or_path + ".py" - except EnvironmentError: - logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.") - raise - else: - try: - # Load from URL or cache if already cached - resolved_module_file = hf_hub_download( - pretrained_model_name_or_path, - filename=module_file, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - ) - submodule = os.path.join("local", "--".join(pretrained_model_name_or_path.split("/"))) - except EnvironmentError: - logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.") - raise - - # Check we have all the requirements in our environment - modules_needed = check_imports(resolved_module_file) - - # Now we move the module inside our cached dynamic modules. - full_submodule = PPDIFFUSERS_DYNAMIC_MODULE_NAME + os.path.sep + submodule - create_dynamic_module(full_submodule) - submodule_path = Path(PPDIFFUSERS_MODULES_CACHE) / full_submodule - if submodule == "local" or submodule == "git": - # We always copy local files (we could hash the file to see if there was a change, and give them the name of - # that hash, to only copy when there is a modification but it seems overkill for now). - # The only reason we do the copy is to avoid putting too many folders in sys.path. - shutil.copy(resolved_module_file, submodule_path / module_file) - for module_needed in modules_needed: - module_needed = f"{module_needed}.py" - shutil.copy(os.path.join(pretrained_model_name_or_path, module_needed), submodule_path / module_needed) - else: - # Get the commit hash - # TODO: we will get this info in the etag soon, so retrieve it from there and not here. 
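The dynamic_modules_utils.py removal above covers the helpers that let ppdiffusers load a community pipeline from a plain .py file: scan the file for relative imports, verify its third-party imports are installed, copy everything into a cache package on `sys.path`, and import the requested class by name. A compressed sketch of that flow, reusing the same regexes as the deleted helpers but with hypothetical paths and no download or caching logic:

```python
import importlib
import re
from pathlib import Path


def get_relative_imports(module_file):
    # Same two patterns as the deleted helper: `import .xxx` and `from .xxx import yyy`.
    content = Path(module_file).read_text(encoding="utf-8")
    relative = re.findall(r"^\s*import\s+\.(\S+)\s*$", content, flags=re.MULTILINE)
    relative += re.findall(r"^\s*from\s+\.(\S+)\s+import", content, flags=re.MULTILINE)
    return sorted(set(relative))


def check_imports(module_file):
    # Top-level absolute imports must already be importable in the environment.
    content = Path(module_file).read_text(encoding="utf-8")
    imports = re.findall(r"^\s*import\s+(\S+)\s*$", content, flags=re.MULTILINE)
    imports += re.findall(r"^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE)
    missing = []
    for imp in {i.split(".")[0] for i in imports if not i.startswith(".")}:
        try:
            importlib.import_module(imp)
        except ImportError:
            missing.append(imp)
    if missing:
        raise ImportError(f"Missing packages: {', '.join(missing)}")
    return get_relative_imports(module_file)


def get_class_in_module(class_name, module_path):
    # `module_path` is relative to a directory already on sys.path,
    # e.g. "ppdiffusers_modules/git/my_pipeline" (hypothetical layout).
    module = importlib.import_module(module_path.replace("/", "."))
    return getattr(module, class_name)
```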
- if isinstance(use_auth_token, str): - token = use_auth_token - elif use_auth_token is True: - token = HfFolder.get_token() - else: - token = None - - commit_hash = model_info(pretrained_model_name_or_path, revision=revision, token=token).sha - - # The module file will end up being placed in a subfolder with the git hash of the repo. This way we get the - # benefit of versioning. - submodule_path = submodule_path / commit_hash - full_submodule = full_submodule + os.path.sep + commit_hash - create_dynamic_module(full_submodule) - - if not (submodule_path / module_file).exists(): - shutil.copy(resolved_module_file, submodule_path / module_file) - # Make sure we also have every file with relative - for module_needed in modules_needed: - if not (submodule_path / module_needed).exists(): - get_cached_module_file( - pretrained_model_name_or_path, - f"{module_needed}.py", - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - use_auth_token=use_auth_token, - revision=revision, - local_files_only=local_files_only, - ) - return os.path.join(full_submodule, module_file) - - -def get_class_from_dynamic_module( - pretrained_model_name_or_path: Union[str, os.PathLike], - module_file: str, - class_name: Optional[str] = None, - cache_dir: Optional[Union[str, os.PathLike]] = None, - force_download: bool = False, - resume_download: bool = False, - proxies: Optional[Dict[str, str]] = None, - use_auth_token: Optional[Union[bool, str]] = None, - revision: Optional[str] = None, - local_files_only: bool = False, - **kwargs, -): - """ - Extracts a class from a module file, present in the local folder or repository of a model. - - - - Calling this function will execute the code in the module file found locally or downloaded from the Hub. It should - therefore only be called on trusted repos. - - - - Args: - pretrained_model_name_or_path (`str` or `os.PathLike`): - This can be either: - - - a string, the *model id* of a pretrained model configuration hosted inside a model repo on - huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced - under a user or organization name, like `dbmdz/bert-base-german-cased`. - - a path to a *directory* containing a configuration file saved using the - [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. - - module_file (`str`): - The name of the module file containing the class to look for. - class_name (`str`): - The name of the class to import in the module. - cache_dir (`str` or `os.PathLike`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the standard - cache should not be used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force to (re-)download the configuration files and override the cached versions if they - exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. - use_auth_token (`str` or `bool`, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). 
- revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. - local_files_only (`bool`, *optional*, defaults to `False`): - If `True`, will only try to load the tokenizer configuration from local files. - - - - You may pass a token in `use_auth_token` if you are not logged in (`huggingface-cli long`) and want to use private - or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models). - - - - Returns: - `type`: The class, dynamically imported from the module. - - Examples: - - ```python - # Download module `modeling.py` from huggingface.co and cache then extract the class `MyBertModel` from this - # module. - cls = get_class_from_dynamic_module("sgugger/my-bert-model", "modeling.py", "MyBertModel") - ```""" - # And lastly we get the class inside our newly created module - final_module = get_cached_module_file( - pretrained_model_name_or_path, - module_file, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - use_auth_token=use_auth_token, - revision=revision, - local_files_only=local_files_only, - ) - return get_class_in_module(class_name, final_module.replace(".py", "")) diff --git a/ppdiffusers/ppdiffusers/utils/hub_utils.py b/ppdiffusers/ppdiffusers/utils/hub_utils.py deleted file mode 100644 index 422738a601ef..000000000000 --- a/ppdiffusers/ppdiffusers/utils/hub_utils.py +++ /dev/null @@ -1,219 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import os -import re -import sys -import traceback -from pathlib import Path -from typing import Dict, Optional, Union -from uuid import uuid4 - -from huggingface_hub import HfFolder, ModelCard, ModelCardData, whoami -from huggingface_hub.file_download import REGEX_COMMIT_HASH -from huggingface_hub.utils import is_jinja_available - -from ..version import VERSION as __version__ -from .constants import DIFFUSERS_CACHE, HUGGINGFACE_CO_RESOLVE_ENDPOINT -from .import_utils import ( - ENV_VARS_TRUE_VALUES, - _fastdeploy_version, - _paddle_version, - _torch_version, - is_fastdeploy_available, - is_paddle_available, - is_torch_available, -) -from .logging import get_logger - -logger = get_logger(__name__) - - -MODEL_CARD_TEMPLATE_PATH = Path(__file__).parent / "model_card_template.md" -SESSION_ID = uuid4().hex -HF_HUB_OFFLINE = os.getenv("HF_HUB_OFFLINE", "").upper() in ENV_VARS_TRUE_VALUES -DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", "").upper() in ENV_VARS_TRUE_VALUES -HUGGINGFACE_CO_TELEMETRY = HUGGINGFACE_CO_RESOLVE_ENDPOINT + "/api/telemetry/" - - -def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str: - """ - Formats a user-agent string with basic info about a request. 
- """ - ua = f"ppdiffusers/{__version__}; python/{sys.version.split()[0]}; session_id/{SESSION_ID}" - if DISABLE_TELEMETRY or HF_HUB_OFFLINE: - return ua + "; telemetry/off" - if is_torch_available(): - ua += f"; torch/{_torch_version}" - if is_paddle_available(): - ua += f"; paddle/{_paddle_version}" - if is_fastdeploy_available(): - ua += f"; fastdeploy/{_fastdeploy_version}" - # CI will set this value to True - if os.environ.get("PPDIFFUSERS_IS_CI", "").upper() in ENV_VARS_TRUE_VALUES: - ua += "; is_ci/true" - if isinstance(user_agent, dict): - ua += "; " + "; ".join(f"{k}/{v}" for k, v in user_agent.items()) - elif isinstance(user_agent, str): - ua += "; " + user_agent - return ua - - -def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): - if token is None: - token = HfFolder.get_token() - if organization is None: - username = whoami(token)["name"] - return f"{username}/{model_id}" - else: - return f"{organization}/{model_id}" - - -def create_model_card(args, model_name): - if not is_jinja_available(): - raise ValueError( - "Modelcard rendering is based on Jinja templates." - " Please make sure to have `jinja` installed before using `create_model_card`." - " To install it, please run `pip install Jinja2`." - ) - - if hasattr(args, "local_rank") and args.local_rank not in [-1, 0]: - return - - hub_token = args.hub_token if hasattr(args, "hub_token") else None - repo_name = get_full_repo_name(model_name, token=hub_token) - - model_card = ModelCard.from_template( - card_data=ModelCardData( # Card metadata object that will be converted to YAML block - language="en", - license="apache-2.0", - library_name="ppdiffusers", - tags=[], - datasets=args.dataset_name, - metrics=[], - ), - template_path=MODEL_CARD_TEMPLATE_PATH, - model_name=model_name, - repo_name=repo_name, - dataset_name=args.dataset_name if hasattr(args, "dataset_name") else None, - learning_rate=args.learning_rate, - train_batch_size=args.train_batch_size, - eval_batch_size=args.eval_batch_size, - gradient_accumulation_steps=( - args.gradient_accumulation_steps if hasattr(args, "gradient_accumulation_steps") else None - ), - adam_beta1=args.adam_beta1 if hasattr(args, "adam_beta1") else None, - adam_beta2=args.adam_beta2 if hasattr(args, "adam_beta2") else None, - adam_weight_decay=args.adam_weight_decay if hasattr(args, "adam_weight_decay") else None, - adam_epsilon=args.adam_epsilon if hasattr(args, "adam_epsilon") else None, - lr_scheduler=args.lr_scheduler if hasattr(args, "lr_scheduler") else None, - lr_warmup_steps=args.lr_warmup_steps if hasattr(args, "lr_warmup_steps") else None, - ema_inv_gamma=args.ema_inv_gamma if hasattr(args, "ema_inv_gamma") else None, - ema_power=args.ema_power if hasattr(args, "ema_power") else None, - ema_max_decay=args.ema_max_decay if hasattr(args, "ema_max_decay") else None, - mixed_precision=args.mixed_precision, - ) - - card_path = os.path.join(args.output_dir, "README.md") - model_card.save(card_path) - - -def extract_commit_hash(resolved_file: Optional[str], commit_hash: Optional[str] = None): - """ - Extracts the commit hash from a resolved filename toward a cache file. 
- """ - if resolved_file is None or commit_hash is not None: - return commit_hash - resolved_file = str(Path(resolved_file).as_posix()) - search = re.search(r"snapshots/([^/]+)/", resolved_file) - if search is None: - return None - commit_hash = search.groups()[0] - return commit_hash if REGEX_COMMIT_HASH.match(commit_hash) else None - - -# Old default cache path, potentially to be migrated. -# This logic was more or less taken from `transformers`, with the following differences: -# - Diffusers doesn't use custom environment variables to specify the cache path. -# - There is no need to migrate the cache format, just move the files to the new location. -hf_cache_home = os.path.expanduser( - os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface")) -) -old_diffusers_cache = os.path.join(hf_cache_home, "diffusers") - - -def move_cache(old_cache_dir: Optional[str] = None, new_cache_dir: Optional[str] = None) -> None: - if new_cache_dir is None: - new_cache_dir = DIFFUSERS_CACHE - if old_cache_dir is None: - old_cache_dir = old_diffusers_cache - - old_cache_dir = Path(old_cache_dir).expanduser() - new_cache_dir = Path(new_cache_dir).expanduser() - # move file blob by blob - for old_blob_path in old_cache_dir.glob("**/blobs/*"): - if old_blob_path.is_file() and not old_blob_path.is_symlink(): - new_blob_path = new_cache_dir / old_blob_path.relative_to(old_cache_dir) - new_blob_path.parent.mkdir(parents=True, exist_ok=True) - os.replace(old_blob_path, new_blob_path) - try: - os.symlink(new_blob_path, old_blob_path) - except OSError: - logger.warning( - "Could not create symlink between old cache and new cache. If you use an older version of diffusers again, files will be re-downloaded." - ) - - # At this point, old_cache_dir contains symlinks to the new cache (it can still be used). - - -cache_version_file = os.path.join(DIFFUSERS_CACHE, "version_diffusers_cache.txt") -if not os.path.isfile(cache_version_file): - cache_version = 0 -else: - try: - with open(cache_version_file) as f: - cache_version = int(f.read()) - except Exception: - cache_version = 0 - -if cache_version < 1: - old_cache_is_not_empty = os.path.isdir(old_diffusers_cache) and len(os.listdir(old_diffusers_cache)) > 0 - if old_cache_is_not_empty: - logger.warning( - "The cache for model files in Diffusers v0.14.0 has moved to a new location. Moving your " - "existing cached models. This is a one-time operation, you can interrupt it or run it " - "later by calling `diffusers.utils.hub_utils.move_cache()`." - ) - try: - move_cache() - except Exception as e: - trace = "\n".join(traceback.format_tb(e.__traceback__)) - logger.error( - f"There was a problem when trying to move your cache:\n\n{trace}\n{e.__class__.__name__}: {e}\n\nPlease " - "file an issue at https://github.com/huggingface/diffusers/issues/new/choose, copy paste this whole " - "message and we will do our best to help." - ) - -if cache_version < 1: - try: - os.makedirs(DIFFUSERS_CACHE, exist_ok=True) - with open(cache_version_file, "w") as f: - f.write("1") - except Exception: - logger.warning( - f"There was a problem when trying to write in your cache folder ({DIFFUSERS_CACHE}). Please, ensure " - "the directory exists and can be written to." - ) diff --git a/ppdiffusers/ppdiffusers/utils/import_utils.py b/ppdiffusers/ppdiffusers/utils/import_utils.py deleted file mode 100644 index d37de014ac11..000000000000 --- a/ppdiffusers/ppdiffusers/utils/import_utils.py +++ /dev/null @@ -1,619 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. 
All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Import utilities: Utilities related to imports and our lazy inits. -""" -import importlib.util -import operator as op -import os -import sys -from collections import OrderedDict -from typing import Union - -from packaging.version import Version, parse - -from . import logging - -# The package importlib_metadata is in a different place, depending on the python version. -if sys.version_info < (3, 8): - import importlib_metadata -else: - import importlib.metadata as importlib_metadata - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"} -ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"}) - -USE_PADDLE = os.environ.get("USE_PADDLE", "AUTO").upper() -USE_SAFETENSORS = os.environ.get("USE_SAFETENSORS", "AUTO").upper() - -STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt} - -_paddle_version = "N/A" -if USE_PADDLE in ENV_VARS_TRUE_AND_AUTO_VALUES: - _paddle_available = importlib.util.find_spec("paddle") is not None - _ppxformers_available = False - if _paddle_available: - try: - import paddle - - _paddle_version = paddle.__version__ - logger.info(f"Paddle version {_paddle_version} available.") - except importlib_metadata.PackageNotFoundError: - _paddle_available = False - - if _paddle_available: - try: - from paddle.incubate.nn.memory_efficient_attention import ( # noqa - memory_efficient_attention, - ) - - _ppxformers_available = True - except ImportError: - _ppxformers_available = False - -else: - logger.info("Disabling Paddle because USE_PADDLE is set") - _paddle_available = False - _ppxformers_available = False - -_torch_version = "N/A" -_torch_available = importlib.util.find_spec("torch") is not None -if _torch_available: - try: - _torch_version = importlib_metadata.version("torch") - logger.info(f"PyTorch version {_torch_version} available.") - except importlib_metadata.PackageNotFoundError: - _torch_available = False - -if USE_SAFETENSORS in ENV_VARS_TRUE_AND_AUTO_VALUES: - _safetensors_available = importlib.util.find_spec("safetensors") is not None - if _safetensors_available: - try: - _safetensors_version = importlib_metadata.version("safetensors") - logger.info(f"Safetensors version {_safetensors_version} available.") - except importlib_metadata.PackageNotFoundError: - _safetensors_available = False -else: - logger.info("Disabling Safetensors because USE_TF is set") - _safetensors_available = False - -_transformers_available = importlib.util.find_spec("transformers") is not None -try: - _transformers_version = importlib_metadata.version("transformers") - logger.debug(f"Successfully imported transformers version {_transformers_version}") -except importlib_metadata.PackageNotFoundError: - _transformers_available = False - - -_inflect_available = importlib.util.find_spec("inflect") is not None -try: - _inflect_version = 
importlib_metadata.version("inflect") - logger.debug(f"Successfully imported inflect version {_inflect_version}") -except importlib_metadata.PackageNotFoundError: - _inflect_available = False - - -_unidecode_available = importlib.util.find_spec("unidecode") is not None -try: - _unidecode_version = importlib_metadata.version("unidecode") - logger.debug(f"Successfully imported unidecode version {_unidecode_version}") -except importlib_metadata.PackageNotFoundError: - _unidecode_available = False - -_fastdeploy_version = "N/A" -_fastdeploy_available = importlib.util.find_spec("fastdeploy") is not None -if _fastdeploy_available: - candidates = ("fastdeploy_gpu_python", "fastdeploy_python") - # For the metadata, we have to look for both fastdeploy_python and fastdeploy_gpu_python - for pkg in candidates: - try: - _fastdeploy_version = importlib_metadata.version(pkg) - break - except importlib_metadata.PackageNotFoundError: - pass - _fastdeploy_available = _fastdeploy_version != "N/A" - if _fastdeploy_available: - logger.debug(f"Successfully imported fastdeploy version {_fastdeploy_version}") - -_paddlenlp_available = importlib.util.find_spec("paddlenlp") is not None -try: - _paddlenlp_version = importlib_metadata.version("paddlenlp") - logger.debug(f"Successfully imported paddlenlp version {_paddlenlp_version}") -except importlib_metadata.PackageNotFoundError: - _paddlenlp_available = False - -# (sayakpaul): importlib.util.find_spec("opencv-python") returns None even when it's installed. -# _opencv_available = importlib.util.find_spec("opencv-python") is not None -try: - candidates = ( - "opencv-python", - "opencv-contrib-python", - "opencv-python-headless", - "opencv-contrib-python-headless", - ) - _opencv_version = None - for pkg in candidates: - try: - _opencv_version = importlib_metadata.version(pkg) - break - except importlib_metadata.PackageNotFoundError: - pass - _opencv_available = _opencv_version is not None - if _opencv_available: - logger.debug(f"Successfully imported cv2 version {_opencv_version}") -except importlib_metadata.PackageNotFoundError: - _opencv_available = False - -_scipy_available = importlib.util.find_spec("scipy") is not None -try: - _scipy_version = importlib_metadata.version("scipy") - logger.debug(f"Successfully imported scipy version {_scipy_version}") -except importlib_metadata.PackageNotFoundError: - _scipy_available = False - -_librosa_available = importlib.util.find_spec("librosa") is not None -try: - _librosa_version = importlib_metadata.version("librosa") - logger.debug(f"Successfully imported librosa version {_librosa_version}") -except importlib_metadata.PackageNotFoundError: - _librosa_available = False - -_k_diffusion_available = importlib.util.find_spec("k_diffusion") is not None -try: - _k_diffusion_version = importlib_metadata.version("k_diffusion") - logger.debug(f"Successfully imported k-diffusion version {_k_diffusion_version}") -except importlib_metadata.PackageNotFoundError: - _k_diffusion_available = False - -_note_seq_available = importlib.util.find_spec("note_seq") is not None -try: - _note_seq_version = importlib_metadata.version("note_seq") - logger.debug(f"Successfully imported note-seq version {_note_seq_version}") -except importlib_metadata.PackageNotFoundError: - _note_seq_available = False - -_wandb_available = importlib.util.find_spec("wandb") is not None -try: - _wandb_version = importlib_metadata.version("wandb") - logger.debug(f"Successfully imported wandb version {_wandb_version }") -except importlib_metadata.PackageNotFoundError: 
- _wandb_available = False - -_omegaconf_available = importlib.util.find_spec("omegaconf") is not None -try: - _omegaconf_version = importlib_metadata.version("omegaconf") - logger.debug(f"Successfully imported omegaconf version {_omegaconf_version}") -except importlib_metadata.PackageNotFoundError: - _omegaconf_available = False - -_tensorboard_available = importlib.util.find_spec("tensorboard") -try: - _tensorboard_version = importlib_metadata.version("tensorboard") - logger.debug(f"Successfully imported tensorboard version {_tensorboard_version}") -except importlib_metadata.PackageNotFoundError: - _tensorboard_available = False - -_visualdl_available = importlib.util.find_spec("visualdl") -try: - _visualdl_version = importlib_metadata.version("visualdl") - logger.debug(f"Successfully imported visualdl version {_visualdl_version}") -except importlib_metadata.PackageNotFoundError: - _visualdl_available = False - -_einops_available = importlib.util.find_spec("einops") -try: - try: - import einops - import einops.layers.paddle - - einops.layers.paddle - logger.debug(f"Successfully imported einops version {einops.__version__}") - except ImportError: - _einops_available = False -except importlib_metadata.PackageNotFoundError: - _einops_available = False - -_compel_available = importlib.util.find_spec("compel") -try: - _compel_version = importlib_metadata.version("compel") - logger.debug(f"Successfully imported compel version {_compel_version}") -except importlib_metadata.PackageNotFoundError: - _compel_available = False - - -_ftfy_available = importlib.util.find_spec("ftfy") is not None -try: - _ftfy_version = importlib_metadata.version("ftfy") - logger.debug(f"Successfully imported ftfy version {_ftfy_version}") -except importlib_metadata.PackageNotFoundError: - _ftfy_available = False - - -_bs4_available = importlib.util.find_spec("bs4") is not None -try: - # importlib metadata under different name - _bs4_version = importlib_metadata.version("beautifulsoup4") - logger.debug(f"Successfully imported ftfy version {_bs4_version}") -except importlib_metadata.PackageNotFoundError: - _bs4_available = False - - -def is_paddle_available(): - return _paddle_available - - -def is_paddlenlp_available(): - return _paddlenlp_available - - -def is_visualdl_available(): - return _visualdl_available - - -def is_fastdeploy_available(): - return _fastdeploy_available - - -def is_ppxformers_available(): - return _ppxformers_available - - -def is_torch_available(): - return _torch_available - - -def is_safetensors_available(): - return _safetensors_available - - -def is_transformers_available(): - return _transformers_available - - -def is_inflect_available(): - return _inflect_available - - -def is_unidecode_available(): - return _unidecode_available - - -def is_opencv_available(): - return _opencv_available - - -def is_scipy_available(): - return _scipy_available - - -def is_librosa_available(): - return _librosa_available - - -def is_k_diffusion_available(): - return False # _k_diffusion_available - - -def is_wandb_available(): - return _wandb_available - - -def is_omegaconf_available(): - return _omegaconf_available - - -def is_tensorboard_available(): - return _tensorboard_available - - -def is_einops_available(): - return _einops_available - - -def is_note_seq_available(): - return _note_seq_available - - -def is_compel_available(): - return _compel_available - - -def is_ftfy_available(): - return _ftfy_available - - -def is_bs4_available(): - return _bs4_available - - -# docstyle-ignore 
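The probes deleted with import_utils.py all follow the same two-step shape: `importlib.util.find_spec` tells you whether the module can be imported at all, and `importlib.metadata.version` confirms an installed distribution and yields its version, with a fallback list of distribution names for packages whose import name differs from their PyPI name (bs4 vs. beautifulsoup4, the opencv-python variants, the fastdeploy wheels). A generic sketch of that probe with example package names only, not the full ppdiffusers list:

```python
import importlib.util
import logging

try:
    import importlib.metadata as importlib_metadata  # Python >= 3.8
except ImportError:
    import importlib_metadata  # backport for older interpreters

logger = logging.getLogger(__name__)


def probe(package, dist_names=None):
    """Return (available, version) for an optional dependency.

    `dist_names` lets the distribution name differ from the import name.
    """
    if importlib.util.find_spec(package) is None:
        return False, None
    for dist in dist_names or (package,):
        try:
            version = importlib_metadata.version(dist)
            logger.debug("Successfully imported %s version %s", package, version)
            return True, version
        except importlib_metadata.PackageNotFoundError:
            continue
    return False, None


_scipy_available, _scipy_version = probe("scipy")
_bs4_available, _bs4_version = probe("bs4", dist_names=("beautifulsoup4",))
```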
-FASTDEPLOY_IMPORT_ERROR = """ -{0} requires the fastdeploy library but it was not found in your environment. You can install it with pip: `pip install -fastdeploy-gpu-python -f https://www.paddlepaddle.org.cn/whl/fastdeploy.html` -""" - -# docstyle-ignore -PADDLE_IMPORT_ERROR = """ -{0} requires the Paddle library but it was not found in your environment. Checkout the instructions on the -installation page: https://www.paddlepaddle.org.cn/install/quick and follow the ones that match your environment. -""" - -# docstyle-ignore -PPXFORMERS_IMPORT_ERROR = """ -{0} requires the scaled_dot_product_attention but your PaddlePaddle donot have this. Checkout the instructions on the -installation page: https://www.paddlepaddle.org.cn/install/quick and follow the ones that match your environment. -""" - -# docstyle-ignore -PADDLENLP_IMPORT_ERROR = """ -{0} requires the paddlenlp library but it was not found in your environment. You can install it with pip: `pip -install paddlenlp` -""" - -# docstyle-ignore -TENSORBOARD_IMPORT_ERROR = """ -{0} requires the tensorboard library but it was not found in your environment. You can install it with pip: `pip -install tensorboard` -""" - -# docstyle-ignore -VISUALDL_IMPORT_ERROR = """ -{0} requires the visualdl library but it was not found in your environment. You can install it with pip: `pip -install visualdl` -""" - -# docstyle-ignore -INFLECT_IMPORT_ERROR = """ -{0} requires the inflect library but it was not found in your environment. You can install it with pip: `pip install -inflect` -""" - -# docstyle-ignore -PYTORCH_IMPORT_ERROR = """ -{0} requires the PyTorch library but it was not found in your environment. Checkout the instructions on the -installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment. -""" - -# docstyle-ignore -OPENCV_IMPORT_ERROR = """ -{0} requires the OpenCV library but it was not found in your environment. You can install it with pip: `pip -install opencv-python` -""" - -# docstyle-ignore -OPENCV_IMPORT_ERROR = """ -{0} requires the OpenCV library but it was not found in your environment. You can install it with pip: `pip -install opencv-python` -""" - -# docstyle-ignore -SCIPY_IMPORT_ERROR = """ -{0} requires the scipy library but it was not found in your environment. You can install it with pip: `pip install -scipy` -""" - -# docstyle-ignore -LIBROSA_IMPORT_ERROR = """ -{0} requires the librosa library but it was not found in your environment. Checkout the instructions on the -installation page: https://librosa.org/doc/latest/install.html and follow the ones that match your environment. -""" - - -# docstyle-ignore -UNIDECODE_IMPORT_ERROR = """ -{0} requires the unidecode library but it was not found in your environment. You can install it with pip: `pip install -Unidecode` -""" - -# docstyle-ignore -K_DIFFUSION_IMPORT_ERROR = """ -{0} requires the k-diffusion library but it was not found in your environment. You can install it with pip: `pip -install k-diffusion` -""" - -# docstyle-ignore -NOTE_SEQ_IMPORT_ERROR = """ -{0} requires the note-seq library but it was not found in your environment. You can install it with pip: `pip -install note-seq` -""" - -# docstyle-ignore -WANDB_IMPORT_ERROR = """ -{0} requires the wandb library but it was not found in your environment. You can install it with pip: `pip -install wandb` -""" - -# docstyle-ignore -OMEGACONF_IMPORT_ERROR = """ -{0} requires the omegaconf library but it was not found in your environment. 
You can install it with pip: `pip -install omegaconf` -""" - -# docstyle-ignore -TENSORBOARD_IMPORT_ERROR = """ -{0} requires the tensorboard library but it was not found in your environment. You can install it with pip: `pip -install tensorboard` -""" - -# docstyle-ignore -COMPEL_IMPORT_ERROR = """ -{0} requires the compel library but it was not found in your environment. You can install it with pip: `pip install compel` -""" - -# docstyle-ignore -EINOPS_IMPORT_ERROR = """ -{0} requires the einops[paddle] library but it was not found in your environment. You can update it with pip: `pip -install -U einops or pip install git+https://github.com/arogozhnikov/einops.git ` -""" - -# docstyle-ignore -BS4_IMPORT_ERROR = """ -{0} requires the Beautiful Soup library but it was not found in your environment. You can install it with pip: -`pip install beautifulsoup4`. Please note that you may need to restart your runtime after installation. -""" - -# docstyle-ignore -FTFY_IMPORT_ERROR = """ -{0} requires the ftfy library but it was not found in your environment. Checkout the instructions on the -installation section: https://github.com/rspeer/python-ftfy/tree/master#installing and follow the ones -that match your environment. Please note that you may need to restart your runtime after installation. -""" - -BACKENDS_MAPPING = OrderedDict( - [ - ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)), - ("fastdeploy", (is_fastdeploy_available, FASTDEPLOY_IMPORT_ERROR)), - ("paddle", (is_paddle_available, PADDLE_IMPORT_ERROR)), - ("paddlenlp", (is_paddlenlp_available, PADDLENLP_IMPORT_ERROR)), - ("visualdl", (is_visualdl_available, VISUALDL_IMPORT_ERROR)), - ("inflect", (is_inflect_available, INFLECT_IMPORT_ERROR)), - ("opencv", (is_opencv_available, OPENCV_IMPORT_ERROR)), - ("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)), - ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)), - ("unidecode", (is_unidecode_available, UNIDECODE_IMPORT_ERROR)), - ("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)), - ("k_diffusion", (is_k_diffusion_available, K_DIFFUSION_IMPORT_ERROR)), - ("wandb", (is_wandb_available, WANDB_IMPORT_ERROR)), - ("omegaconf", (is_omegaconf_available, OMEGACONF_IMPORT_ERROR)), - ("tensorboard", (is_tensorboard_available, TENSORBOARD_IMPORT_ERROR)), - ("einops", (is_einops_available, EINOPS_IMPORT_ERROR)), - ("note_seq", (is_note_seq_available, NOTE_SEQ_IMPORT_ERROR)), - ("compel", (is_compel_available, COMPEL_IMPORT_ERROR)), - ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)), - ] -) - - -def requires_backends(obj, backends): - if not isinstance(backends, (list, tuple)): - backends = [backends] - - name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ - checks = (BACKENDS_MAPPING[backend] for backend in backends) - failed = [msg.format(name) for available, msg in checks if not available()] - if failed: - raise ImportError("".join(failed)) - - if name in [ - "VersatileDiffusionTextToImagePipeline", - "VersatileDiffusionPipeline", - "VersatileDiffusionDualGuidedPipeline", - "StableDiffusionImageVariationPipeline", - "UnCLIPPipeline", - ] and is_paddlenlp_version("<", "2.5.0"): - raise ImportError( - f"You need to install `paddlenlp>=2.5.0` in order to use {name}: \n```\n pip install" - " --upgrade paddlenlp \n```" - ) - - if name in ["StableDiffusionDepth2ImgPipeline", "StableDiffusionPix2PixZeroPipeline"] and is_paddlenlp_version( - "<", "2.5.1" # TODO version - ): - raise ImportError( - f"You need to install `paddlenlp>=2.5.1` in order to use {name}: \n```\n pip 
install" - " --upgrade paddlenlp \n```" - ) - - -class DummyObject(type): - """ - Metaclass for the dummy objects. Any class inheriting from it will return the ImportError generated by - `requires_backend` each time a user tries to access any method of that class. - """ - - def __getattr__(cls, key): - if key.startswith("_"): - return super().__getattr__(cls, key) - requires_backends(cls, cls._backends) - - -# This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L319 -def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str): - """ - Args: - Compares a library version to some requirement using a given operation. - library_or_version (`str` or `packaging.version.Version`): - A library name or a version to check. - operation (`str`): - A string representation of an operator, such as `">"` or `"<="`. - requirement_version (`str`): - The version to compare the library version against - """ - if operation not in STR_OPERATION_TO_FUNC.keys(): - raise ValueError(f"`operation` must be one of {list(STR_OPERATION_TO_FUNC.keys())}, received {operation}") - operation = STR_OPERATION_TO_FUNC[operation] - if isinstance(library_or_version, str): - library_or_version = parse(importlib_metadata.version(library_or_version)) - return operation(library_or_version, parse(requirement_version)) - - -# This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L338 -def is_torch_version(operation: str, version: str): - """ - Args: - Compares the current PyTorch version to a given reference with an operation. - operation (`str`): - A string representation of an operator, such as `">"` or `"<="` - version (`str`): - A string version of PyTorch - """ - return compare_versions(parse(_torch_version), operation, version) - - -def is_paddle_version(operation: str, version: str): - """ - Args: - Compares the current Paddle version to a given reference with an operation. - operation (`str`): - A string representation of an operator, such as `">"` or `"<="` - version (`str`): - A string version of Paddle - """ - return compare_versions(parse(_paddle_version), operation, version) - - -def is_paddlenlp_version(operation: str, version: str): - """ - Args: - Compares the current paddlenlp version to a given reference with an operation. - operation (`str`): - A string representation of an operator, such as `">"` or `"<="` - version (`str`): - A version string - """ - if not _paddlenlp_available: - return False - return compare_versions(parse(_paddlenlp_version), operation, version) - - -def is_k_diffusion_version(operation: str, version: str): - """ - Args: - Compares the current k-diffusion version to a given reference with an operation. 
- operation (`str`): - A string representation of an operator, such as `">"` or `"<="` - version (`str`): - A version string - """ - if not _k_diffusion_available: - return False - return compare_versions(parse(_k_diffusion_version), operation, version) - - -class OptionalDependencyNotAvailable(BaseException): - """An error indicating that an optional dependency of Diffusers was not found in the environment.""" diff --git a/ppdiffusers/ppdiffusers/utils/initializer_utils.py b/ppdiffusers/ppdiffusers/utils/initializer_utils.py deleted file mode 100644 index c7182a88b5e9..000000000000 --- a/ppdiffusers/ppdiffusers/utils/initializer_utils.py +++ /dev/null @@ -1,325 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py -Ths copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file. -""" - -import math - -import numpy as np -import paddle -import paddle.nn as nn - -__all__ = [ - "uniform_", - "normal_", - "constant_", - "ones_", - "zeros_", - "xavier_uniform_", - "xavier_normal_", - "kaiming_uniform_", - "kaiming_normal_", - "linear_init_", - "conv_init_", - "reset_initialized_parameter", -] - - -def _no_grad_uniform_(tensor, a, b): - with paddle.no_grad(): - tensor.uniform_(min=a, max=b) - return tensor - - -def _no_grad_normal_(tensor, mean=0.0, std=1.0): - with paddle.no_grad(): - tensor.copy_(paddle.normal(mean=mean, std=std, shape=tensor.shape), True) - return tensor - - -def _no_grad_fill_(tensor, value=0.0): - with paddle.no_grad(): - tensor.fill_(value) - return tensor - - -def uniform_(tensor, a, b): - """ - Modified tensor inspace using uniform_ - Args: - tensor (paddle.Tensor): paddle Tensor - a (float|int): min value. - b (float|int): max value. - Return: - tensor - """ - return _no_grad_uniform_(tensor, a, b) - - -def normal_(tensor, mean=0.0, std=1.0): - """ - Modified tensor inspace using normal_ - Args: - tensor (paddle.Tensor): paddle Tensor - mean (float|int): mean value. - std (float|int): std value. - Return: - tensor - """ - return _no_grad_normal_(tensor, mean, std) - - -def constant_(tensor, value=0.0): - """ - Modified tensor inspace using constant_ - Args: - tensor (paddle.Tensor): paddle Tensor - value (float|int): value to fill tensor. 
- Return: - tensor - """ - return _no_grad_fill_(tensor, value) - - -def ones_(tensor): - """ - Modified tensor inspace using ones_ - Args: - tensor (paddle.Tensor): paddle Tensor - Return: - tensor - """ - return _no_grad_fill_(tensor, 1) - - -def zeros_(tensor): - """ - Modified tensor inspace using zeros_ - Args: - tensor (paddle.Tensor): paddle Tensor - Return: - tensor - """ - return _no_grad_fill_(tensor, 0) - - -def vector_(tensor, vector): - with paddle.no_grad(): - tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype)) - return tensor - - -def _calculate_fan_in_and_fan_out(tensor, reverse=False): - """ - Calculate (fan_in, _fan_out) for tensor - Args: - tensor (Tensor): paddle.Tensor - reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. : conv.weight [cout, cin, kh, kw] is False; linear.weight [cin, cout] is True - Return: - Tuple[fan_in, fan_out] - """ - if tensor.ndim < 2: - raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions") - - if reverse: - num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] - else: - num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0] - - receptive_field_size = 1 - if tensor.ndim > 2: - receptive_field_size = np.prod(tensor.shape[2:]) - - fan_in = num_input_fmaps * receptive_field_size - fan_out = num_output_fmaps * receptive_field_size - - return fan_in, fan_out - - -def xavier_uniform_(tensor, gain=1.0, reverse=False): - """ - Modified tensor inspace using xavier_uniform_ - Args: - tensor (paddle.Tensor): paddle Tensor - gain (float): super parameter, 1. default. - reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. - Return: - tensor - """ - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) - std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) - k = math.sqrt(3.0) * std - return _no_grad_uniform_(tensor, -k, k) - - -def xavier_normal_(tensor, gain=1.0, reverse=False): - """ - Modified tensor inspace using xavier_normal_ - Args: - tensor (paddle.Tensor): paddle Tensor - gain (float): super parameter, 1. default. - reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
- Return: - tensor - """ - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) - std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) - return _no_grad_normal_(tensor, 0, std) - - -# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html -def _calculate_correct_fan(tensor, mode, reverse=False): - mode = mode.lower() - valid_modes = ["fan_in", "fan_out"] - if mode not in valid_modes: - raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes)) - - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) - - return fan_in if mode == "fan_in" else fan_out - - -def _calculate_gain(nonlinearity, param=None): - linear_fns = ["linear", "conv1d", "conv2d", "conv3d", "conv_transpose1d", "conv_transpose2d", "conv_transpose3d"] - if nonlinearity in linear_fns or nonlinearity == "sigmoid": - return 1 - elif nonlinearity == "tanh": - return 5.0 / 3 - elif nonlinearity == "relu": - return math.sqrt(2.0) - elif nonlinearity == "leaky_relu": - if param is None: - negative_slope = 0.01 - elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float): - # True/False are instances of int, hence check above - negative_slope = param - else: - raise ValueError("negative_slope {} not a valid number".format(param)) - return math.sqrt(2.0 / (1 + negative_slope**2)) - elif nonlinearity == "selu": - return 3.0 / 4 - else: - raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) - - -def kaiming_uniform_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False): - """ - Modified tensor inspace using kaiming_uniform method - Args: - tensor (paddle.Tensor): paddle Tensor - mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut - nonlinearity (str): nonlinearity method name - reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. - Return: - tensor - """ - fan = _calculate_correct_fan(tensor, mode, reverse) - gain = _calculate_gain(nonlinearity, a) - std = gain / math.sqrt(fan) - k = math.sqrt(3.0) * std - return _no_grad_uniform_(tensor, -k, k) - - -def kaiming_normal_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False): - """ - Modified tensor inspace using kaiming_normal_ - Args: - tensor (paddle.Tensor): paddle Tensor - mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut - nonlinearity (str): nonlinearity method name - reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
- Return: - tensor - """ - fan = _calculate_correct_fan(tensor, mode, reverse) - gain = _calculate_gain(nonlinearity, a) - std = gain / math.sqrt(fan) - return _no_grad_normal_(tensor, 0, std) - - -def linear_init_(module): - bound = 1 / math.sqrt(module.weight.shape[0]) - uniform_(module.weight, -bound, bound) - uniform_(module.bias, -bound, bound) - - -def conv_init_(module): - bound = 1 / np.sqrt(np.prod(module.weight.shape[1:])) - uniform_(module.weight, -bound, bound) - if module.bias is not None: - uniform_(module.bias, -bound, bound) - - -def bias_init_with_prob(prior_prob=0.01): - """initialize conv/fc bias value according to a given probability value.""" - bias_init = float(-np.log((1 - prior_prob) / prior_prob)) - return bias_init - - -@paddle.no_grad() -def reset_initialized_parameter(model, include_self=True): - """ - Reset initialized parameter using following method for [conv, linear, embedding, bn] - Args: - model (paddle.Layer): paddle Layer - include_self (bool: False): include_self for Layer.named_sublayers method. Indicate whether including itself - Return: - None - """ - for _, m in model.named_sublayers(include_self=include_self): - if isinstance(m, nn.Conv2D): - k = float(m._groups) / (m._in_channels * m._kernel_size[0] * m._kernel_size[1]) - k = math.sqrt(k) - _no_grad_uniform_(m.weight, -k, k) - if hasattr(m, "bias") and getattr(m, "bias") is not None: - _no_grad_uniform_(m.bias, -k, k) - - elif isinstance(m, nn.Linear): - k = math.sqrt(1.0 / m.weight.shape[0]) - _no_grad_uniform_(m.weight, -k, k) - if hasattr(m, "bias") and getattr(m, "bias") is not None: - _no_grad_uniform_(m.bias, -k, k) - - elif isinstance(m, nn.Embedding): - _no_grad_normal_(m.weight, mean=0.0, std=1.0) - - elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)): - if hasattr(m, "weight") and getattr(m, "weight") is not None: - _no_grad_fill_(m.weight, 1.0) - if hasattr(m, "bias") and getattr(m, "bias") is not None: - _no_grad_fill_(m.bias, 0) - - -class Init: - def __init__(self): - for init_func in [ - uniform_, - normal_, - constant_, - ones_, - zeros_, - xavier_uniform_, - xavier_normal_, - kaiming_uniform_, - kaiming_normal_, - linear_init_, - conv_init_, - ]: - setattr(self, init_func.__name__, init_func) - - -setattr(nn, "init", Init()) diff --git a/ppdiffusers/ppdiffusers/utils/load_utils.py b/ppdiffusers/ppdiffusers/utils/load_utils.py deleted file mode 100644 index 21583b92650e..000000000000 --- a/ppdiffusers/ppdiffusers/utils/load_utils.py +++ /dev/null @@ -1,365 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
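The uniform bound used by `kaiming_uniform_` above follows directly from the fan and gain computations in `_calculate_fan_in_and_fan_out` and `_calculate_gain`. A small worked sketch of that arithmetic for a hypothetical Linear-style weight (the shape is made up for illustration):

```python
import math

# Shape convention used above (reverse=False): [fan_out, fan_in, ...].
weight_shape = (128, 64)        # hypothetical Linear-style weight: 64 in, 128 out
fan_in = weight_shape[1]        # receptive_field_size is 1 for a 2-D weight

a = 0                           # kaiming_uniform_'s default negative_slope
gain = math.sqrt(2.0 / (1 + a ** 2))   # _calculate_gain("leaky_relu", 0) == sqrt(2)

std = gain / math.sqrt(fan_in)  # mode="fan_in"
bound = math.sqrt(3.0) * std    # weights are drawn from U(-bound, bound)
print(f"U(-{bound:.4f}, {bound:.4f})")
```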
- -import io -import os -import pickle -from functools import lru_cache -from pathlib import Path -from typing import Union -from zipfile import ZipFile - -import numpy as np - -from .constants import get_map_location_default -from .import_utils import ( - is_paddle_available, - is_safetensors_available, - is_torch_available, -) -from .logging import get_logger - -logger = get_logger(__name__) - -__all__ = ["smart_load", "torch_load", "safetensors_load"] - - -paddle_suffix = [".pdparams", ".pd"] -torch_suffix = [".pt", ".pth", ".bin", ".ckpt"] -safetensors_suffix = [".safetensors"] - -if is_safetensors_available(): - # patch_bf16 safe tensors - import safetensors.numpy - - np.bfloat16 = np.uint16 - np.bool = bool - safetensors.numpy._TYPES.update({"BF16": np.uint16}) - -if is_torch_available(): - import torch - - # patch torch.uint16 - torch.uint16 = torch.bfloat16 - -if is_paddle_available(): - import paddle - - -MZ_ZIP_LOCAL_DIR_HEADER_SIZE = 30 - - -def is_torch_file(filename=None): - if filename is None: - return False - suffix = Path(str(filename)).suffix.lower() - return suffix in torch_suffix + safetensors_suffix - - -def read_prefix_key(path): - file_size = os.stat(path).st_size - with open(path, "rb") as file_handler: - end_index = seek_by_string(file_handler, "data.pkl", file_size) - file_handler.seek(MZ_ZIP_LOCAL_DIR_HEADER_SIZE) - prefix_key = file_handler.read(end_index - MZ_ZIP_LOCAL_DIR_HEADER_SIZE - len("/data.pkl")) - return prefix_key.decode("latin") - - -def seek_by_string(file_handler, string: str, file_size: int) -> int: - word_index = 0 - word_bytes = string.encode("latin") - empty_byte = "".encode("latin") - - while word_index < len(string) and file_handler.tell() < file_size: - content = file_handler.read(1) - if content == empty_byte: - break - - if word_bytes[word_index] == content[0]: - word_index += 1 - else: - word_index = 0 - - if file_handler.tell() >= file_size - 1: - raise Exception(f"can't find the find the target string<{string}> in the file") - return file_handler.tell() - - -def _maybe_decode_ascii(bytes_str: Union[bytes, str]) -> str: - if isinstance(bytes_str, bytes): - return bytes_str.decode("ascii") - return bytes_str - - -@lru_cache(maxsize=None) -def _storage_type_to_dtype_to_map(): - """convert storage type to numpy dtype""" - return { - "DoubleStorage": np.double, - "FloatStorage": np.float32, - "HalfStorage": np.half, - "LongStorage": np.int64, - "IntStorage": np.int32, - "ShortStorage": np.int16, - "CharStorage": np.int8, - "ByteStorage": np.uint8, - "BoolStorage": np.bool_, - "ComplexDoubleStorage": np.cdouble, - "ComplexFloatStorage": np.cfloat, - "BFloat16Storage": np.uint16, - } - - -class StorageType: - """Temp Class for Storage Type""" - - def __init__(self, name): - self.dtype = _storage_type_to_dtype_to_map()[name] - - def __str__(self): - return f"StorageType(dtype={self.dtype})" - - -def _element_size(dtype: str) -> int: - """ - Returns the element size for a dtype, in bytes - """ - if dtype in [np.float16, np.float32, np.float64]: - return np.finfo(dtype).bits >> 3 - elif dtype == np.bool_: - return 1 - else: - return np.iinfo(dtype).bits >> 3 - - -class UnpicklerWrapperStage(pickle.Unpickler): - def find_class(self, mod_name, name): - if type(name) is str and "Storage" in name: - try: - return StorageType(name) - except KeyError: - pass - - # pure torch tensor builder - if mod_name == "torch._utils": - if name == "_rebuild_parameter": - return _rebuild_parameter - if name == "_rebuild_parameter_with_state": - return 
_rebuild_parameter_with_state - return _rebuild_tensor_stage - - # pytorch_lightning tensor builder - if "pytorch_lightning" in mod_name: - return dumpy - return super().find_class(mod_name, name) - - -def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, backward_hooks): - # if a tensor has shape [M, N] and stride is [1, N], it's column-wise / fortran-style - # if a tensor has shape [M, N] and stride is [M, 1], it's row-wise / C-style - # defautls to C-style - if stride is not None and len(stride) > 1 and stride[0] == 1 and stride[1] > 1: - order = "F" - else: - order = "C" - - # fix bug when load https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth - numel = int(np.prod(size)) - return storage[storage_offset : storage_offset + numel].reshape(size, order=order) - - -def _rebuild_parameter(data, requires_grad, backward_hooks): - return data - - -def _rebuild_parameter_with_state(data, requires_grad, backward_hooks, state): - return data - - -def dumpy(*args, **kwarsg): - return None - - -def torch_load(path: str, **pickle_load_args): - if is_torch_available(): - import torch - - state_dict = torch.load(path, map_location="cpu") - else: - pickle_load_args.update({"encoding": "utf-8"}) - - prefix_key = read_prefix_key(path) - - torch_zip = ZipFile(path, "r") - loaded_storages = {} - - def load_tensor(dtype, numel, key, location): - name = f"{prefix_key}/data/{key}" - typed_storage = np.frombuffer(torch_zip.open(name).read()[:numel], dtype=dtype) - return typed_storage - - def persistent_load(saved_id): - assert isinstance(saved_id, tuple) - typename = _maybe_decode_ascii(saved_id[0]) - data = saved_id[1:] - - assert ( - typename == "storage" - ), f"Unknown typename for persistent_load, expected 'storage' but got '{typename}'" - storage_type, key, location, numel = data - dtype = storage_type.dtype - - if key in loaded_storages: - typed_storage = loaded_storages[key] - else: - nbytes = numel * _element_size(dtype) - typed_storage = load_tensor(dtype, nbytes, key, _maybe_decode_ascii(location)) - loaded_storages[key] = typed_storage - - return typed_storage - - data_iostream = torch_zip.open(f"{prefix_key}/data.pkl").read() - unpickler_stage = UnpicklerWrapperStage(io.BytesIO(data_iostream), **pickle_load_args) - unpickler_stage.persistent_load = persistent_load - state_dict = unpickler_stage.load() - torch_zip.close() - return state_dict - - -def convert_to_paddle(state_dict, return_numpy=False, return_global_step=False): - pd_state_dict = {} - # maybe we will use global_step - if return_global_step: - pd_state_dict["global_step"] = state_dict.pop("global_step", -1) - state_dict = state_dict.get("state_dict", state_dict) - - # ugly - # { - # "state_dict" : {"state_dict": {}, "epoch": {}, "xxxxx"} - # } - if "global_step" in state_dict and "state_dict" in state_dict: - if return_global_step: - pd_state_dict["global_step"] = state_dict.pop("global_step", -1) - while "state_dict" in state_dict: - state_dict = state_dict["state_dict"] - - for k, v in state_dict.items(): - # maybe position id is bfloat32 - # if "position_id" in k and "int" not in str(v.dtype): - # v = v.numpy().astype("int64") if hasattr(v, "numpy") else v.astype("int64") - if v.ndim == 0: - v = v.reshape((1,)) - if not return_numpy: - # support bfloat16 - if "torch.bfloat16" in str(v.dtype): - v = v.float() - pd_state_dict[k] = ( - paddle.to_tensor(v.numpy()).cast(paddle.bfloat16) - if hasattr(v, "numpy") - else paddle.to_tensor(v).cast(paddle.bfloat16) - ) - else: - pd_state_dict[k] = 
paddle.to_tensor(v.numpy()) if hasattr(v, "numpy") else paddle.to_tensor(v) - else: - pd_state_dict[k] = v.numpy() if hasattr(v, "numpy") else v - - return pd_state_dict - - -def convert_to_numpy(state_dict): - state_dict = state_dict.get("state_dict", state_dict) - pd_state_dict = {} - for k, v in state_dict.items(): - # maybe position id is bfloat32 - # if "position_id" in k and "int" not in str(v.dtype): - # v = v.numpy().astype("int64") if hasattr(v, "numpy") else v.astype("int64") - if v.ndim == 0: - v = v.reshape((1,)) - return pd_state_dict - - -def safetensors_load(path: str): - if is_safetensors_available(): - try: - if is_torch_available(): - from safetensors.torch import load_file - - data = load_file(path) - else: - from safetensors.numpy import load_file - - data = load_file(path) - except Exception: - from safetensors.numpy import load_file - - data = load_file(path) - else: - raise ImportError("`safetensors_load` requires the `safetensors library: `pip install safetensors`.") - - return data - - -def smart_load( - path: str, - map_location: str = None, - return_numpy: bool = False, - return_global_step: bool = False, - return_is_torch_weight: bool = False, -): - if map_location is None: - map_location = get_map_location_default() - - suffix = Path(path).suffix - name = Path(path).name - state_dict = None - with paddle.device_scope(map_location): - if suffix in paddle_suffix: - state_dict = paddle.load(path, return_numpy=return_numpy) - return state_dict - - if suffix in torch_suffix: - state_dict = convert_to_paddle(torch_load(path), return_numpy, return_global_step) - if return_is_torch_weight: - state_dict["is_torch_weight"] = True - return state_dict - - if suffix in safetensors_suffix: - state_dict = convert_to_paddle(safetensors_load(path), return_numpy, return_global_step) - if return_is_torch_weight: - state_dict["is_torch_weight"] = True - return state_dict - - # must use safetensors_load first - try: - state_dict = convert_to_paddle(safetensors_load(path), return_numpy, return_global_step) - if return_is_torch_weight: - state_dict["is_torch_weight"] = True - return state_dict - except Exception: - logger.info(f"Cant load file {name} with safetensors!") - try: - state_dict = convert_to_paddle(torch_load(path), return_numpy, return_global_step) - if return_is_torch_weight: - state_dict["is_torch_weight"] = True - return state_dict - except Exception: - logger.info(f"Cant load file {name} with torch! We will try to load this with safetensors!") - try: - state_dict = paddle.load(path, return_numpy=return_numpy) - return state_dict - except Exception: - logger.info(f"Cant load file {name} with paddle! We will try to load this with torch/safetensors!") - if state_dict is None: - raise ValueError(f"Cant load {name}, currently we only support ['torch', 'safetensors', 'paddle']!") diff --git a/ppdiffusers/ppdiffusers/utils/logging.py b/ppdiffusers/ppdiffusers/utils/logging.py deleted file mode 100644 index 12b12c075d2e..000000000000 --- a/ppdiffusers/ppdiffusers/utils/logging.py +++ /dev/null @@ -1,339 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 Optuna, Hugging Face -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
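`smart_load` above first dispatches on the file suffix and only falls back to trial-and-error loading for unknown extensions. A stripped-down sketch of that dispatch logic, with placeholder loaders standing in for `paddle.load`, `torch_load` and `safetensors_load` (the placeholders simply return empty dicts):

```python
from pathlib import Path

# Illustrative stand-ins only; the real loaders are paddle.load, torch_load
# and safetensors_load from the module above.
def _load_paddle(path):
    return {}

def _load_torch(path):
    return {}

def _load_safetensors(path):
    return {}

SUFFIX_LOADERS = {
    ".pdparams": _load_paddle, ".pd": _load_paddle,
    ".pt": _load_torch, ".pth": _load_torch, ".bin": _load_torch, ".ckpt": _load_torch,
    ".safetensors": _load_safetensors,
}

def smart_load_sketch(path):
    loader = SUFFIX_LOADERS.get(Path(path).suffix.lower())
    if loader is not None:
        return loader(path)
    # Unknown suffix: mirror the fallback order of the original
    # (safetensors first, then torch, then paddle).
    for fallback in (_load_safetensors, _load_torch, _load_paddle):
        try:
            return fallback(path)
        except Exception:
            continue
    raise ValueError(f"Can't load {path!r}: only paddle, torch and safetensors formats are supported.")
```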
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Logging utilities.""" - -import logging -import os -import sys -import threading -from logging import CRITICAL # NOQA -from logging import DEBUG # NOQA -from logging import ERROR # NOQA -from logging import FATAL # NOQA -from logging import INFO # NOQA -from logging import NOTSET # NOQA -from logging import WARN # NOQA -from logging import WARNING # NOQA -from typing import Optional - -from tqdm import auto as tqdm_lib - -_lock = threading.Lock() -_default_handler: Optional[logging.Handler] = None - -log_levels = { - "debug": logging.DEBUG, - "info": logging.INFO, - "warning": logging.WARNING, - "error": logging.ERROR, - "critical": logging.CRITICAL, -} - -_default_log_level = logging.WARNING - -_tqdm_active = True - - -def _get_default_logging_level(): - """ - If PPDIFFUSERS_VERBOSITY env var is set to one of the valid choices return that as the new default level. If it is - not - fall back to `_default_log_level` - """ - env_level_str = os.getenv("PPDIFFUSERS_VERBOSITY", None) - if env_level_str: - if env_level_str in log_levels: - return log_levels[env_level_str] - else: - logging.getLogger().warning( - f"Unknown option PPDIFFUSERS_VERBOSITY={env_level_str}, " - f"has to be one of: { ', '.join(log_levels.keys()) }" - ) - return _default_log_level - - -def _get_library_name() -> str: - return __name__.split(".")[0] - - -def _get_library_root_logger() -> logging.Logger: - return logging.getLogger(_get_library_name()) - - -def _configure_library_root_logger() -> None: - global _default_handler - - with _lock: - if _default_handler: - # This library has already configured the library root logger. - return - _default_handler = logging.StreamHandler() # Set sys.stderr as stream. - _default_handler.flush = sys.stderr.flush - - # Apply our default configuration to the library root logger. - library_root_logger = _get_library_root_logger() - library_root_logger.addHandler(_default_handler) - library_root_logger.setLevel(_get_default_logging_level()) - library_root_logger.propagate = False - - -def _reset_library_root_logger() -> None: - global _default_handler - - with _lock: - if not _default_handler: - return - - library_root_logger = _get_library_root_logger() - library_root_logger.removeHandler(_default_handler) - library_root_logger.setLevel(logging.NOTSET) - _default_handler = None - - -def get_log_levels_dict(): - return log_levels - - -def get_logger(name: Optional[str] = None) -> logging.Logger: - """ - Return a logger with the specified name. - - This function is not supposed to be directly accessed unless you are writing a custom ppdiffusers module. - """ - - if name is None: - name = _get_library_name() - - _configure_library_root_logger() - return logging.getLogger(name) - - -def get_verbosity() -> int: - """ - Return the current level for the 🤗 PPDiffusers' root logger as an int. - - Returns: - `int`: The logging level. 
- - - - 🤗 Diffusers has following logging levels: - - - 50: `ppdiffusers.logging.CRITICAL` or `ppdiffusers.logging.FATAL` - - 40: `ppdiffusers.logging.ERROR` - - 30: `ppdiffusers.logging.WARNING` or `ppdiffusers.logging.WARN` - - 20: `ppdiffusers.logging.INFO` - - 10: `ppdiffusers.logging.DEBUG` - - """ - - _configure_library_root_logger() - return _get_library_root_logger().getEffectiveLevel() - - -def set_verbosity(verbosity: int) -> None: - """ - Set the verbosity level for the 🤗 Diffusers' root logger. - - Args: - verbosity (`int`): - Logging level, e.g., one of: - - - `ppdiffusers.logging.CRITICAL` or `ppdiffusers.logging.FATAL` - - `ppdiffusers.logging.ERROR` - - `ppdiffusers.logging.WARNING` or `ppdiffusers.logging.WARN` - - `ppdiffusers.logging.INFO` - - `ppdiffusers.logging.DEBUG` - """ - - _configure_library_root_logger() - _get_library_root_logger().setLevel(verbosity) - - -def set_verbosity_info(): - """Set the verbosity to the `INFO` level.""" - return set_verbosity(INFO) - - -def set_verbosity_warning(): - """Set the verbosity to the `WARNING` level.""" - return set_verbosity(WARNING) - - -def set_verbosity_debug(): - """Set the verbosity to the `DEBUG` level.""" - return set_verbosity(DEBUG) - - -def set_verbosity_error(): - """Set the verbosity to the `ERROR` level.""" - return set_verbosity(ERROR) - - -def disable_default_handler() -> None: - """Disable the default handler of the PaddleNLP PPDiffusers' root logger.""" - - _configure_library_root_logger() - - assert _default_handler is not None - _get_library_root_logger().removeHandler(_default_handler) - - -def enable_default_handler() -> None: - """Enable the default handler of the PaddleNLP PPDiffusers' root logger.""" - - _configure_library_root_logger() - - assert _default_handler is not None - _get_library_root_logger().addHandler(_default_handler) - - -def add_handler(handler: logging.Handler) -> None: - """adds a handler to the PaddleNLP PPDiffusers' root logger.""" - - _configure_library_root_logger() - - assert handler is not None - _get_library_root_logger().addHandler(handler) - - -def remove_handler(handler: logging.Handler) -> None: - """removes given handler from the PaddleNLP PPDiffusers' root logger.""" - - _configure_library_root_logger() - - assert handler is not None and handler not in _get_library_root_logger().handlers - _get_library_root_logger().removeHandler(handler) - - -def disable_propagation() -> None: - """ - Disable propagation of the library log outputs. Note that log propagation is disabled by default. - """ - - _configure_library_root_logger() - _get_library_root_logger().propagate = False - - -def enable_propagation() -> None: - """ - Enable propagation of the library log outputs. Please disable the PaddleNLP PPDiffusers' default handler to prevent - double logging if the root logger has been configured. - """ - - _configure_library_root_logger() - _get_library_root_logger().propagate = True - - -def enable_explicit_format() -> None: - """ - Enable explicit formatting for every PaddleNLP PPDiffusers' logger. The explicit formatter is as follows: - ``` - [LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE - ``` - All handlers currently bound to the root logger are affected by this method. 
- """ - handlers = _get_library_root_logger().handlers - - for handler in handlers: - formatter = logging.Formatter("[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s") - handler.setFormatter(formatter) - - -def reset_format() -> None: - """ - Resets the formatting for PaddleNLP PPDiffusers' loggers. - - All handlers currently bound to the root logger are affected by this method. - """ - handlers = _get_library_root_logger().handlers - - for handler in handlers: - handler.setFormatter(None) - - -def warning_advice(self, *args, **kwargs): - """ - This method is identical to `logger.warning()`, but if env var PPDIFFUSERS_NO_ADVISORY_WARNINGS=1 is set, this - warning will not be printed - """ - no_advisory_warnings = os.getenv("PPDIFFUSERS_NO_ADVISORY_WARNINGS", False) - if no_advisory_warnings: - return - self.warning(*args, **kwargs) - - -logging.Logger.warning_advice = warning_advice - - -class EmptyTqdm: - """Dummy tqdm which doesn't do anything.""" - - def __init__(self, *args, **kwargs): # pylint: disable=unused-argument - self._iterator = args[0] if args else None - - def __iter__(self): - return iter(self._iterator) - - def __getattr__(self, _): - """Return empty function.""" - - def empty_fn(*args, **kwargs): # pylint: disable=unused-argument - return - - return empty_fn - - def __enter__(self): - return self - - def __exit__(self, type_, value, traceback): - return - - -class _tqdm_cls: - def __call__(self, *args, **kwargs): - if _tqdm_active: - return tqdm_lib.tqdm(*args, **kwargs) - else: - return EmptyTqdm(*args, **kwargs) - - def set_lock(self, *args, **kwargs): - self._lock = None - if _tqdm_active: - return tqdm_lib.tqdm.set_lock(*args, **kwargs) - - def get_lock(self): - if _tqdm_active: - return tqdm_lib.tqdm.get_lock() - - -tqdm = _tqdm_cls() - - -def is_progress_bar_enabled() -> bool: - """Return a boolean indicating whether tqdm progress bars are enabled.""" - global _tqdm_active - return bool(_tqdm_active) - - -def enable_progress_bar(): - """Enable tqdm progress bar.""" - global _tqdm_active - _tqdm_active = True - - -def disable_progress_bar(): - """Disable tqdm progress bar.""" - global _tqdm_active - _tqdm_active = False diff --git a/ppdiffusers/ppdiffusers/utils/model_card_template.md b/ppdiffusers/ppdiffusers/utils/model_card_template.md deleted file mode 100644 index f473b3237b9b..000000000000 --- a/ppdiffusers/ppdiffusers/utils/model_card_template.md +++ /dev/null @@ -1,48 +0,0 @@ ---- -{{ card_data }} ---- - - - -# {{ model_name | default("Diffusion Model") }} - -## Model description - -This diffusion model is trained with the [PPDiffusers](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers) library -on the `{{ dataset_name }}` dataset. 
- -## Intended uses & limitations - -#### How to use - -```python -# TODO: add an example code snippet for running this diffusion pipeline -``` - -#### Limitations and bias - -[TODO: provide examples of latent issues and potential remediations] - -## Training data - -[TODO: describe the data used to train the model] - -### Training hyperparameters - -The following hyperparameters were used during training: -- learning_rate: {{ learning_rate }} -- train_batch_size: {{ train_batch_size }} -- eval_batch_size: {{ eval_batch_size }} -- gradient_accumulation_steps: {{ gradient_accumulation_steps }} -- optimizer: AdamW with betas=({{ adam_beta1 }}, {{ adam_beta2 }}), weight_decay={{ adam_weight_decay }} and epsilon={{ adam_epsilon }} -- lr_scheduler: {{ lr_scheduler }} -- lr_warmup_steps: {{ lr_warmup_steps }} -- ema_inv_gamma: {{ ema_inv_gamma }} -- ema_inv_gamma: {{ ema_power }} -- ema_inv_gamma: {{ ema_max_decay }} -- mixed_precision: {{ mixed_precision }} - -### Training results - -📈 [TensorBoard logs](https://huggingface.co/{{ repo_name }}/tensorboard?#scalars) diff --git a/ppdiffusers/ppdiffusers/utils/outputs.py b/ppdiffusers/ppdiffusers/utils/outputs.py deleted file mode 100644 index b71ef22559c4..000000000000 --- a/ppdiffusers/ppdiffusers/utils/outputs.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Generic utilities -""" - -from collections import OrderedDict -from dataclasses import fields -from typing import Any, Tuple - -import numpy as np - -from .import_utils import is_paddle_available - - -def is_tensor(x): - """ - Tests if `x` is a `paddle.Tensor` or `np.ndarray`. - """ - if is_paddle_available(): - import paddle - - if isinstance(x, paddle.Tensor): - return True - - return isinstance(x, np.ndarray) - - -class BaseOutput(OrderedDict): - """ - Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a - tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular - python dictionary. - - - - You can't unpack a `BaseOutput` directly. Use the [`~utils.BaseOutput.to_tuple`] method to convert it to a tuple - before. 
- - - """ - - def __post_init__(self): - class_fields = fields(self) - - # Safety and consistency checks - if not len(class_fields): - raise ValueError(f"{self.__class__.__name__} has no fields.") - - first_field = getattr(self, class_fields[0].name) - other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:]) - - if other_fields_are_none and isinstance(first_field, dict): - for key, value in first_field.items(): - self[key] = value - else: - for field in class_fields: - v = getattr(self, field.name) - if v is not None: - self[field.name] = v - - def __delitem__(self, *args, **kwargs): - raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.") - - def setdefault(self, *args, **kwargs): - raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.") - - def pop(self, *args, **kwargs): - raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") - - def update(self, *args, **kwargs): - raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.") - - def __getitem__(self, k): - if isinstance(k, str): - inner_dict = dict(self.items()) - return inner_dict[k] - else: - return self.to_tuple()[k] - - def __setattr__(self, name, value): - if name in self.keys() and value is not None: - # Don't call self.__setitem__ to avoid recursion errors - super().__setitem__(name, value) - super().__setattr__(name, value) - - def __setitem__(self, key, value): - # Will raise a KeyException if needed - super().__setitem__(key, value) - # Don't call self.__setattr__ to avoid recursion errors - super().__setattr__(key, value) - - def to_tuple(self) -> Tuple[Any]: - """ - Convert self to a tuple containing all the attributes/keys that are not `None`. - """ - # try to fix: https://github.com/PaddlePaddle/PaddleNLP/issues/3355 - # when trying to get the keys of `OrderedDict`, `keys` method return empty values. - # TODO(wj-Mcat): this bug should be fixed in Paddle framework - tuples = () - for field in fields(self): - if getattr(self, field.name, None) is None: - continue - tuples = tuples + (getattr(self, field.name),) - - return tuples diff --git a/ppdiffusers/ppdiffusers/utils/paddle_utils.py b/ppdiffusers/ppdiffusers/utils/paddle_utils.py deleted file mode 100644 index d06d45cee340..000000000000 --- a/ppdiffusers/ppdiffusers/utils/paddle_utils.py +++ /dev/null @@ -1,212 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -Paddle utilities: Utilities related to Paddle -""" -import contextlib -import time -from contextlib import contextmanager -from typing import List, Optional, Tuple, Union - -from .import_utils import is_paddle_available -from .logging import get_logger - -logger = get_logger(__name__) # pylint: disable=invalid-name - -if is_paddle_available(): - import paddle - - class RNGStatesTracker: - def __init__(self): - self.states_ = {} - - def reset(self): - self.states_ = {} - - def remove(self, generator_name=None): - if generator_name is not None: - del self.states_[generator_name] - - def manual_seed(self, seed, generator_name=None): - if generator_name is None: - generator_name = str(time.time()) - if generator_name in self.states_: - raise ValueError("state {} already exists".format(generator_name)) - orig_rng_state = paddle.get_cuda_rng_state() - paddle.seed(seed) - self.states_[generator_name] = paddle.get_cuda_rng_state() - paddle.set_cuda_rng_state(orig_rng_state) - return generator_name - - @contextlib.contextmanager - def rng_state(self, generator_name=None): - if generator_name is not None: - if generator_name not in self.states_: - raise ValueError("state {} does not exist".format(generator_name)) - orig_cuda_rng_state = paddle.get_cuda_rng_state() - paddle.set_cuda_rng_state(self.states_[generator_name]) - try: - yield - finally: - self.states_[generator_name] = paddle.get_cuda_rng_state() - paddle.set_cuda_rng_state(orig_cuda_rng_state) - else: - yield - - RNG_STATE_TRACKER = RNGStatesTracker() - - def get_rng_state_tracker(*args, **kwargs): - return RNG_STATE_TRACKER - - paddle.Generator = get_rng_state_tracker - - randn = paddle.randn - rand = paddle.rand - randint = paddle.randint - - @paddle.jit.not_to_static - def randn_pt(shape, dtype=None, name=None, **kwargs): - generator = kwargs.get("generator", None) - is_bfloat16 = "bfloat16" in str(dtype) or "bfloat16" in paddle.get_default_dtype() - if is_bfloat16: - if generator is None: - return randn(shape, dtype="float16", name=name).cast(paddle.bfloat16) - else: - with get_rng_state_tracker().rng_state(generator): - return randn(shape, dtype="float16", name=name).cast(paddle.bfloat16) - else: - if generator is None: - return randn(shape, dtype=dtype, name=name) - else: - with get_rng_state_tracker().rng_state(generator): - return randn(shape, dtype=dtype, name=name) - - @paddle.jit.not_to_static - def rand_pt(shape, dtype=None, name=None, **kwargs): - generator = kwargs.get("generator", None) - if generator is None: - return rand(shape, dtype=dtype, name=name) - else: - with get_rng_state_tracker().rng_state(generator): - return rand(shape, dtype=dtype, name=name) - - @paddle.jit.not_to_static - def randint_pt(low=0, high=None, shape=[1], dtype=None, name=None, **kwargs): - generator = kwargs.get("generator", None) - if generator is None: - return randint(low=low, high=high, shape=shape, dtype=dtype, name=name) - else: - with get_rng_state_tracker().rng_state(generator): - return randint(low=low, high=high, shape=shape, dtype=dtype, name=name) - - @paddle.jit.not_to_static - def randn_like_pt(x, dtype=None, name=None, **kwargs): - generator = kwargs.get("generator", None) - if dtype is None: - dtype = x.dtype - return randn_pt(x.shape, dtype=dtype, generator=generator, name=name, **kwargs) - - paddle.randn = randn_pt - paddle.rand = rand_pt - paddle.randint = randint_pt - paddle.randn_like = randn_like_pt - - def randn_tensor( - shape: Union[Tuple, List], - generator: Optional[Union[List["paddle.Generator"], 
"paddle.Generator"]] = None, - dtype: Optional["paddle.dtype"] = None, - *kwargs, - ): - """This is a helper function that allows to create random tensors on the desired `device` with the desired `dtype`. When - passing a list of generators one can seed each batched size individually. If CPU generators are passed the tensor - will always be created on CPU. - """ - if isinstance(generator, (list, tuple)): - batch_size = shape[0] - shape = (1,) + tuple(shape[1:]) - latents = [randn_pt(shape, generator=generator[i], dtype=dtype) for i in range(batch_size)] - latents = paddle.concat(latents, axis=0) - else: - latents = randn_pt(shape, generator=generator, dtype=dtype) - - return latents - - def rand_tensor( - shape: Union[Tuple, List], - generator: Optional[Union[List["paddle.Generator"], "paddle.Generator"]] = None, - dtype: Optional["paddle.dtype"] = None, - *kwargs, - ): - """This is a helper function that allows to create random tensors on the desired `device` with the desired `dtype`. When - passing a list of generators one can seed each batched size individually. If CPU generators are passed the tensor - will always be created on CPU. - """ - if isinstance(generator, (list, tuple)): - batch_size = shape[0] - shape = (1,) + tuple(shape[1:]) - latents = [rand_pt(shape, generator=generator[i], dtype=dtype) for i in range(batch_size)] - latents = paddle.concat(latents, axis=0) - else: - latents = rand_pt(shape, generator=generator, dtype=dtype) - - return latents - - def randint_tensor( - low=0, - high=None, - shape: Union[Tuple, List] = [1], - generator: Optional["paddle.Generator"] = None, - dtype: Optional["paddle.dtype"] = None, - *kwargs, - ): - """This is a helper function that allows to create random tensors on the desired `device` with the desired `dtype`. When - passing a list of generators one can seed each batched size individually. If CPU generators are passed the tensor - will always be created on CPU. - """ - latents = randint_pt(low=low, high=high, shape=shape, dtype=dtype, generator=generator) - - return latents - - @contextmanager - def dtype_guard(dtype="float32"): - if isinstance(dtype, paddle.dtype): - dtype = str(dtype).replace("paddle.", "") - origin_dtype = paddle.get_default_dtype() - paddle.set_default_dtype(dtype) - try: - yield - finally: - paddle.set_default_dtype(origin_dtype) - - paddle.dtype_guard = dtype_guard - - _init_weights = True - - @contextmanager - def no_init_weights(_enable=True): - """ - Context manager to globally disable weight initialization to speed up loading large models. - - TODO(Patrick): Delete safety argument `_enable=True` at next major version. . - """ - global _init_weights - old_init_weights = _init_weights - if _enable: - _init_weights = False - try: - yield - finally: - _init_weights = old_init_weights diff --git a/ppdiffusers/ppdiffusers/utils/pil_utils.py b/ppdiffusers/ppdiffusers/utils/pil_utils.py deleted file mode 100644 index bef4901a7e5f..000000000000 --- a/ppdiffusers/ppdiffusers/utils/pil_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import PIL.Image -import PIL.ImageOps -from packaging import version -from PIL import Image - -if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"): - PIL_INTERPOLATION = { - "linear": PIL.Image.Resampling.BILINEAR, - "bilinear": PIL.Image.Resampling.BILINEAR, - "bicubic": PIL.Image.Resampling.BICUBIC, - "lanczos": PIL.Image.Resampling.LANCZOS, - "nearest": PIL.Image.Resampling.NEAREST, - } -else: - PIL_INTERPOLATION = { - "linear": PIL.Image.LINEAR, - "bilinear": PIL.Image.BILINEAR, - "bicubic": PIL.Image.BICUBIC, - "lanczos": PIL.Image.LANCZOS, - "nearest": PIL.Image.NEAREST, - } - - -def pt_to_pil(images): - images = (images / 2 + 0.5).clamp(0, 1) - images = images.cpu().permute(0, 2, 3, 1).float().numpy() - images = numpy_to_pil(images) - return images - - -def pd_to_pil(images): - images = (images / 2 + 0.5).clip(0, 1) - images = images.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() - images = numpy_to_pil(images) - return images - - -def numpy_to_pil(images): - """ - Convert a numpy image or a batch of images to a PIL image. - """ - if images.ndim == 3: - images = images[None, ...] - images = (images * 255).round().astype("uint8") - if images.shape[-1] == 1: - # special case for grayscale (single channel) images - pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] - else: - pil_images = [Image.fromarray(image) for image in images] - - return pil_images diff --git a/ppdiffusers/ppdiffusers/utils/testing_utils.py b/ppdiffusers/ppdiffusers/utils/testing_utils.py deleted file mode 100644 index 91eb4d869c80..000000000000 --- a/ppdiffusers/ppdiffusers/utils/testing_utils.py +++ /dev/null @@ -1,539 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
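`pd_to_pil` and `numpy_to_pil` above map a decoded batch in [-1, 1] to PIL images. The same round trip can be sketched with NumPy and Pillow alone; the input batch here is random data, purely for illustration:

```python
import numpy as np
from PIL import Image


def numpy_to_pil_sketch(images):
    # images: float array in [0, 1], shape (batch, height, width, channels)
    if images.ndim == 3:
        images = images[None, ...]
    images = (images * 255).round().astype("uint8")
    if images.shape[-1] == 1:
        # grayscale (single channel) images
        return [Image.fromarray(img.squeeze(), mode="L") for img in images]
    return [Image.fromarray(img) for img in images]


# A fake "decoded" batch in [-1, 1], already NHWC as after the transpose in pd_to_pil.
decoded = np.random.uniform(-1.0, 1.0, size=(2, 64, 64, 3)).astype("float32")
pil_images = numpy_to_pil_sketch(np.clip(decoded / 2 + 0.5, 0.0, 1.0))
print(len(pil_images), pil_images[0].size)  # 2 (64, 64)
```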
- -import inspect -import logging -import os -import random -import re -import tempfile -import unittest -import urllib.parse -from distutils.util import strtobool -from io import BytesIO, StringIO -from pathlib import Path -from typing import List, Optional, Union - -import numpy as np -import PIL.Image -import PIL.ImageOps -import requests - -from .import_utils import ( - BACKENDS_MAPPING, - is_compel_available, - is_fastdeploy_available, - is_note_seq_available, - is_opencv_available, - is_paddle_available, - is_paddle_version, - is_torch_available, -) -from .logging import get_logger - -global_rng = random.Random() - -logger = get_logger(__name__) - -if is_paddle_available(): - import paddle - - if "PPDIFFUSERS_TEST_DEVICE" in os.environ: - paddle_device = os.environ["PPDIFFUSERS_TEST_DEVICE"] - - available_backends = ["gpu", "cpu"] - if paddle_device not in available_backends: - raise ValueError( - f"unknown paddle backend for ppdiffusers tests: {paddle_device}. Available backends are:" - f" {available_backends}" - ) - logger.info(f"paddle_device overrode to {paddle_device}") - else: - paddle_device = "gpu" if paddle.is_compiled_with_cuda() else "cpu" - - -def image_grid(imgs, rows=None, cols=None): - if rows is None and cols is None: - rows = 1 - cols = len(imgs) - assert len(imgs) == rows * cols - w, h = imgs[0].size - grid = PIL.Image.new("RGB", size=(cols * w, rows * h)) - - for i, img in enumerate(imgs): - grid.paste(img, box=(i % cols * w, i // cols * h)) - return grid - - -def paddle_all_close(a, b, *args, **kwargs): - if not is_paddle_available(): - raise ValueError("Paddle needs to be installed to use this function.") - if not paddle.allclose(a, b, *args, **kwargs): - assert False, f"Max diff is absolute {(a - b).abs().max()}. Diff tensor is {(a - b).abs()}." - return True - - -def print_tensor_test(tensor, filename="test_corrections.txt", expected_tensor_name="expected_slice"): - test_name = os.environ.get("PYTEST_CURRENT_TEST") - if not paddle.is_tensor(tensor): - tensor = paddle.to_tensor(tensor) - - tensor_str = str(tensor.detach().cpu().flatten().cast("float32")).replace("\n", "") - # format is usually: - # expected_slice = np.array([-0.5713, -0.3018, -0.9814, 0.04663, -0.879, 0.76, -1.734, 0.1044, 1.161]) - output_str = tensor_str.replace("tensor", f"{expected_tensor_name} = np.array") - test_file, test_class, test_fn = test_name.split("::") - test_fn = test_fn.split()[0] - with open(filename, "a") as f: - print(";".join([test_file, test_class, test_fn, output_str]), file=f) - - -def get_tests_dir(append_path=None): - """ - Args: - append_path: optional path to append to the tests dir path - Return: - The full path to the `tests` dir, so that the tests can be invoked from anywhere. Optionally `append_path` is - joined after the `tests` dir the former is provided. - """ - # this function caller's __file__ - caller__file__ = inspect.stack()[1][1] - tests_dir = os.path.abspath(os.path.dirname(caller__file__)) - - while not tests_dir.endswith("tests"): - tests_dir = os.path.dirname(tests_dir) - - if append_path: - return os.path.join(tests_dir, append_path) - else: - return tests_dir - - -def parse_flag_from_env(key, default=False): - try: - value = os.environ[key] - except KeyError: - # KEY isn't set, default to `default`. - _value = default - else: - # KEY is set, convert it to True or False. - try: - _value = strtobool(value) - except ValueError: - # More values are supported, but let's keep the message simple. 
- raise ValueError(f"If set, {key} must be yes or no.") - return _value - - -_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False) -_run_nightly_tests = parse_flag_from_env("RUN_NIGHTLY", default=False) - - -def floats_tensor(shape, scale=1.0, rng=None, name=None): - """Creates a random float32 tensor""" - if rng is None: - rng = global_rng - - total_dims = 1 - for dim in shape: - total_dims *= dim - - values = [] - for _ in range(total_dims): - values.append(rng.random() * scale) - - return paddle.to_tensor(values, dtype=paddle.float32).reshape(shape) - - -def slow(test_case): - """ - Decorator marking a test as slow. - - Slow tests are skipped by default. Set the RUN_SLOW environment variable to a truthy value to run them. - - """ - return unittest.skipUnless(_run_slow_tests, "test is slow")(test_case) - - -def nightly(test_case): - """ - Decorator marking a test that runs nightly in the ppdiffusers CI. - - Slow tests are skipped by default. Set the RUN_NIGHTLY environment variable to a truthy value to run them. - - """ - return unittest.skipUnless(_run_nightly_tests, "test is nightly")(test_case) - - -def require_paddle_2_5(test_case): - """ - Decorator marking a test that requires Paddle 2.5. These tests are skipped when it isn't installed. - """ - return unittest.skipUnless(is_paddle_available() and is_paddle_version(">=", "2.5.0"), "test requires Paddle 2.5")( - test_case - ) - - -def require_paddle(test_case): - """ - Decorator marking a test that requires Paddle. These tests are skipped when Paddle isn't installed. - """ - return unittest.skipUnless(is_paddle_available(), "test requires Paddle")(test_case) - - -def require_torch(test_case): - """Decorator marking a test that requires TORCH.""" - return unittest.skipUnless(is_torch_available(), "test requires TORCH")(test_case) - - -def require_paddle_gpu(test_case): - """Decorator marking a test that requires CUDA and Paddle.""" - return unittest.skipUnless(is_paddle_available() and paddle_device == "gpu", "test requires Paddle+CUDA")( - test_case - ) - - -def require_compel(test_case): - """ - Decorator marking a test that requires compel: https://github.com/damian0815/compel. These tests are skipped when - the library is not installed. - """ - return unittest.skipUnless(is_compel_available(), "test requires compel")(test_case) - - -def require_fastdeploy(test_case): - """ - Decorator marking a test that requires fastdeploy. These tests are skipped when fastdeploy isn't installed. - """ - return unittest.skipUnless(is_fastdeploy_available(), "test requires fastdeploy")(test_case) - - -def require_note_seq(test_case): - """ - Decorator marking a test that requires note_seq. These tests are skipped when note_seq isn't installed. 
- """ - return unittest.skipUnless(is_note_seq_available(), "test requires note_seq")(test_case) - - -def load_numpy(arry: Union[str, np.ndarray], local_path: Optional[str] = None) -> np.ndarray: - if isinstance(arry, str): - # local_path = "/home/patrick_huggingface_co/" - if local_path is not None: - # local_path can be passed to correct images of tests - return os.path.join(local_path, "/".join([arry.split("/")[-5], arry.split("/")[-2], arry.split("/")[-1]])) - elif arry.startswith("http://") or arry.startswith("https://"): - response = requests.get(arry) - response.raise_for_status() - arry = np.load(BytesIO(response.content)) - elif os.path.isfile(arry): - arry = np.load(arry) - else: - raise ValueError( - f"Incorrect path or url, URLs must start with `http://` or `https://`, and {arry} is not a valid path" - ) - elif isinstance(arry, np.ndarray): - pass - else: - raise ValueError( - "Incorrect format used for numpy ndarray. Should be an url linking to an image, a local path, or a" - " ndarray." - ) - - return arry - - -def load_pt(url: str): - if is_torch_available(): - import torch - - response = requests.get(url) - response.raise_for_status() - arry = torch.load(BytesIO(response.content), map_location="cpu") - return arry - else: - raise ValueError("Please install torch firstly!") - - -def load_pd(url: str): - response = requests.get(url) - response.raise_for_status() - arry = paddle.load(BytesIO(response.content)) - return arry - - -def load_image(image: Union[str, PIL.Image.Image]) -> PIL.Image.Image: - """ - Args: - Loads `image` to a PIL Image. - image (`str` or `PIL.Image.Image`): - The image to convert to the PIL Image format. - Returns: - `PIL.Image.Image`: A PIL Image. - """ - if isinstance(image, str): - if image.startswith("http://") or image.startswith("https://"): - image = PIL.Image.open(requests.get(image, stream=True).raw) - elif os.path.isfile(image): - image = PIL.Image.open(image) - else: - raise ValueError( - f"Incorrect path or url, URLs must start with `http://` or `https://`, and {image} is not a valid path" - ) - elif isinstance(image, PIL.Image.Image): - image = image - else: - raise ValueError( - "Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image." 
- ) - image = PIL.ImageOps.exif_transpose(image) - image = image.convert("RGB") - return image - - -def preprocess_image(image: PIL.Image, batch_size: int): - w, h = image.size - w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - image = image.resize((w, h), resample=PIL.Image.LANCZOS) - image = np.array(image).astype(np.float32) / 255.0 - image = np.vstack([image[None].transpose(0, 3, 1, 2)] * batch_size) - image = paddle.to_tensor(image) - return 2.0 * image - 1.0 - - -def export_to_video(video_frames: List[np.ndarray], output_video_path: str = None) -> str: - if is_opencv_available(): - import cv2 - else: - raise ImportError(BACKENDS_MAPPING["opencv"][1].format("export_to_video")) - if output_video_path is None: - output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name - - fourcc = cv2.VideoWriter_fourcc(*"mp4v") - h, w, c = video_frames[0].shape - video_writer = cv2.VideoWriter(output_video_path, fourcc, fps=8, frameSize=(w, h)) - for i in range(len(video_frames)): - img = cv2.cvtColor(video_frames[i], cv2.COLOR_RGB2BGR) - video_writer.write(img) - return output_video_path - - -def load_hf_numpy(path) -> np.ndarray: - if not path.startswith("http://") or path.startswith("https://"): - path = os.path.join( - "https://huggingface.co/datasets/fusing/diffusers-testing/resolve/main", urllib.parse.quote(path) - ) - - return load_numpy(path) - - -def load_ppnlp_numpy(path) -> np.ndarray: - if not path.startswith("http://") or path.startswith("https://"): - path = os.path.join( - "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/diffusers-testing", urllib.parse.quote(path) - ) - return load_numpy(path) - - -# --- pytest conf functions --- # - -# to avoid multiple invocation from tests/conftest.py and examples/conftest.py - make sure it's called only once -pytest_opt_registered = {} - - -def pytest_addoption_shared(parser): - """ - This function is to be called from `conftest.py` via `pytest_addoption` wrapper that has to be defined there. - - It allows loading both `conftest.py` files at once without causing a failure due to adding the same `pytest` - option. - - """ - option = "--make-reports" - if option not in pytest_opt_registered: - parser.addoption( - option, - action="store", - default=False, - help="generate report files. The value of this option is used as a prefix to report names", - ) - pytest_opt_registered[option] = 1 - - -def pytest_terminal_summary_main(tr, id): - """ - Generate multiple reports at the end of test suite run - each report goes into a dedicated file in the current - directory. The report files are prefixed with the test suite name. - - This function emulates --duration and -rA pytest arguments. - - This function is to be called from `conftest.py` via `pytest_terminal_summary` wrapper that has to be defined - there. - - Args: - - tr: `terminalreporter` passed from `conftest.py` - - id: unique id like `tests` or `examples` that will be incorporated into the final reports filenames - this is - needed as some jobs have multiple runs of pytest, so we can't have them overwrite each other. - - NB: this functions taps into a private _pytest API and while unlikely, it could break should - pytest do internal changes - also it calls default internal methods of terminalreporter which - can be hijacked by various `pytest-` plugins and interfere. 
- - """ - from _pytest.config import create_terminal_writer - - if not len(id): - id = "tests" - - config = tr.config - orig_writer = config.get_terminal_writer() - orig_tbstyle = config.option.tbstyle - orig_reportchars = tr.reportchars - - dir = "reports" - Path(dir).mkdir(parents=True, exist_ok=True) - report_files = { - k: f"{dir}/{id}_{k}.txt" - for k in [ - "durations", - "errors", - "failures_long", - "failures_short", - "failures_line", - "passes", - "stats", - "summary_short", - "warnings", - ] - } - - # custom durations report - # note: there is no need to call pytest --durations=XX to get this separate report - # adapted from https://github.com/pytest-dev/pytest/blob/897f151e/src/_pytest/runner.py#L66 - dlist = [] - for replist in tr.stats.values(): - for rep in replist: - if hasattr(rep, "duration"): - dlist.append(rep) - if dlist: - dlist.sort(key=lambda x: x.duration, reverse=True) - with open(report_files["durations"], "w") as f: - durations_min = 0.05 # sec - f.write("slowest durations\n") - for i, rep in enumerate(dlist): - if rep.duration < durations_min: - f.write(f"{len(dlist)-i} durations < {durations_min} secs were omitted") - break - f.write(f"{rep.duration:02.2f}s {rep.when:<8} {rep.nodeid}\n") - - def summary_failures_short(tr): - # expecting that the reports were --tb=long (default) so we chop them off here to the last frame - reports = tr.getreports("failed") - if not reports: - return - tr.write_sep("=", "FAILURES SHORT STACK") - for rep in reports: - msg = tr._getfailureheadline(rep) - tr.write_sep("_", msg, red=True, bold=True) - # chop off the optional leading extra frames, leaving only the last one - longrepr = re.sub(r".*_ _ _ (_ ){10,}_ _ ", "", rep.longreprtext, 0, re.M | re.S) - tr._tw.line(longrepr) - # note: not printing out any rep.sections to keep the report short - - # use ready-made report funcs, we are just hijacking the filehandle to log to a dedicated file each - # adapted from https://github.com/pytest-dev/pytest/blob/897f151e/src/_pytest/terminal.py#L814 - # note: some pytest plugins may interfere by hijacking the default `terminalreporter` (e.g. 
- # pytest-instafail does that) - - # report failures with line/short/long styles - config.option.tbstyle = "auto" # full tb - with open(report_files["failures_long"], "w") as f: - tr._tw = create_terminal_writer(config, f) - tr.summary_failures() - - # config.option.tbstyle = "short" # short tb - with open(report_files["failures_short"], "w") as f: - tr._tw = create_terminal_writer(config, f) - summary_failures_short(tr) - - config.option.tbstyle = "line" # one line per error - with open(report_files["failures_line"], "w") as f: - tr._tw = create_terminal_writer(config, f) - tr.summary_failures() - - with open(report_files["errors"], "w") as f: - tr._tw = create_terminal_writer(config, f) - tr.summary_errors() - - with open(report_files["warnings"], "w") as f: - tr._tw = create_terminal_writer(config, f) - tr.summary_warnings() # normal warnings - tr.summary_warnings() # final warnings - - tr.reportchars = "wPpsxXEf" # emulate -rA (used in summary_passes() and short_test_summary()) - with open(report_files["passes"], "w") as f: - tr._tw = create_terminal_writer(config, f) - tr.summary_passes() - - with open(report_files["summary_short"], "w") as f: - tr._tw = create_terminal_writer(config, f) - tr.short_test_summary() - - with open(report_files["stats"], "w") as f: - tr._tw = create_terminal_writer(config, f) - tr.summary_stats() - - # restore: - tr._tw = orig_writer - tr.reportchars = orig_reportchars - config.option.tbstyle = orig_tbstyle - - -class CaptureLogger: - """ - Args: - Context manager to capture `logging` streams - logger: 'logging` logger object - Returns: - The captured output is available via `self.out` - Example: - ```python - >>> from ppdiffusers import logging - >>> from ppdiffusers.testing_utils import CaptureLogger - - >>> msg = "Testing 1, 2, 3" - >>> logging.set_verbosity_info() - >>> logger = logging.get_logger("ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.py") - >>> with CaptureLogger(logger) as cl: - ... logger.info(msg) - >>> assert cl.out, msg + "\n" - ``` - """ - - def __init__(self, logger): - self.logger = logger - self.io = StringIO() - self.sh = logging.StreamHandler(self.io) - self.out = "" - - def __enter__(self): - self.logger.addHandler(self.sh) - return self - - def __exit__(self, *exc): - self.logger.removeHandler(self.sh) - self.out = self.io.getvalue() - - def __repr__(self): - return f"captured: {self.out}\n" diff --git a/ppdiffusers/ppdiffusers/version.py b/ppdiffusers/ppdiffusers/version.py deleted file mode 100644 index b3afa3263e04..000000000000 --- a/ppdiffusers/ppdiffusers/version.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# this file will be generated by tools -# please not modify it. 
-VERSION = "0.16.1" diff --git a/ppdiffusers/requirements.txt b/ppdiffusers/requirements.txt deleted file mode 100644 index d9f7488a54c1..000000000000 --- a/ppdiffusers/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -paddlenlp>=2.6.0rc0 -safetensors -ftfy -regex -Pillow diff --git a/ppdiffusers/scripts/cocoeval_keypoints_score/README.md b/ppdiffusers/scripts/cocoeval_keypoints_score/README.md deleted file mode 100644 index 92387a580f5d..000000000000 --- a/ppdiffusers/scripts/cocoeval_keypoints_score/README.md +++ /dev/null @@ -1,83 +0,0 @@ -# COCOeval for Keypints - -本文档将指导你如何利用`pycocotools.cocoeval.COCOeval`针对关键点控制生成任务进行控制效果评估。这个工具提供的评估代码可被用于公开的COCO验证集或任何采用相同格式的数据集。它能计算以下几种指标。为了获取合适的格式的测试数据,我们需要运行`get_openpose_keypoints_result_coco_format.py`脚本,它将对真实图像集合和生成的图像集合进行关键点提取,并生成目标格式的关键点检测文件。 - -评估关键点检测的核心理念是模仿用于目标检测的评价指标,即平均精度(AP)和平均召回率(AR)及其变体。这些指标的核心是真实目标和预测目标之间的相似性度量。在目标检测的下,交并比(IoU)就充当了这种相似性度量(适用于框和段)。通过设定IoU阈值,定义真实目标和预测目标之间的匹配,从而能够计算精度-召回率曲线。为了将AP/AR应用于关键点检测,我们只需要定义一个类似的相似性度量。我们通过定义目标关键点相似度(OKS)来实现这一点,它起着与IoU相同的作用。 - -具体来说,以下10个具体指标用于描述关键点检测的效果,其中第一指标最为关键: -``` -Average Precision (AP): -AP -% AP at OKS=.50:.05:.95 (primary challenge metric) - -APOKS=.50 -% AP at OKS=.50 (loose metric) - -APOKS=.75 -% AP at OKS=.75 (strict metric) - -AP Across Scales: -APmedium -% AP for medium objects: 322 < area < 962 - -APlarge -% AP for large objects: area > 962 - -Average Recall (AR): -AR -% AR at OKS=.50:.05:.95 - -AROKS=.50 -% AR at OKS=.50 - -AROKS=.75 -% AR at OKS=.75 - -AR Across Scales: -ARmedium -% AR for medium objects: 322 < area < 962 - -ARlarge -% AR for large objects: area > 962 - -``` - - -## 依赖 -- pycocotools - - -## 使用方法 - -首先,我们需要预备原始图片数据集,位置为`path/to/images_origin`。此外,我们还需要准备如`path/to/images_generate1`、`path/to/images_generate2`等待测试的生成图片数据集。执行以下步骤,我们可以得到`xx_gt.json`和`xx_dt.json`: -``` -python get_openpose_keypoints_result_coco_format.py \ - --do_gt \ - path/to/images_origin \ - path/to/output/images_origin_gt.json \ - path/to/output/images_origin_ppdet -``` -``` -python get_openpose_keypoints_result_coco_format.py \ - path/to/images_generate1 \ - path/to/output/images_generate1_dt.json \ - path/to/output/images_generate1_ppdet -python get_openpose_keypoints_result_coco_format.py \ - path/to/images_generate2 \ - path/to/output/images_generate2_dt.json \ - path/to/output/images_generate2_ppdet -``` - -其次我们需要执行以下命令来获取具体的测试指标: -``` -python cocoeval_keypoints.py \ - --gt path/to/output/images_origin_gt.json \ - --dt path/to/output/images_generate1_dt.json -python cocoeval_keypoints.py \ - --gt path/to/output/images_origin_gt.json \ - --dt path/to/output/images_generate2_dt.json -``` - -## 参考 - -- [https://cocodataset.org/#keypoints-eval](https://cocodataset.org/#keypoints-eval) diff --git a/ppdiffusers/scripts/cocoeval_keypoints_score/annotator b/ppdiffusers/scripts/cocoeval_keypoints_score/annotator deleted file mode 120000 index a2c6f79fbf63..000000000000 --- a/ppdiffusers/scripts/cocoeval_keypoints_score/annotator +++ /dev/null @@ -1 +0,0 @@ -../../examples/controlnet/annotator \ No newline at end of file diff --git a/ppdiffusers/scripts/cocoeval_keypoints_score/cocoeval_keypoints.py b/ppdiffusers/scripts/cocoeval_keypoints_score/cocoeval_keypoints.py deleted file mode 100644 index ea039412ef29..000000000000 --- a/ppdiffusers/scripts/cocoeval_keypoints_score/cocoeval_keypoints.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse - -from pycocotools.coco import COCO -from pycocotools.cocoeval import COCOeval - -# This script references https://cocodataset.org/#keypoints-eval. -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("-g", "--gt", type=str, help="Assign the groud true path.", default=None) - parser.add_argument("-d", "--dt", type=str, help="Assign the detection result path.", default=None) - args = parser.parse_args() - - cocoGt = COCO(args.gt) - cocoDt = cocoGt.loadRes(args.dt) - cocoEval = COCOeval(cocoGt, cocoDt, "keypoints") - cocoEval.evaluate() - cocoEval.accumulate() - cocoEval.summarize() diff --git a/ppdiffusers/scripts/cocoeval_keypoints_score/get_openpose_keypoints_result_coco_format.py b/ppdiffusers/scripts/cocoeval_keypoints_score/get_openpose_keypoints_result_coco_format.py deleted file mode 100644 index aa7e21b46645..000000000000 --- a/ppdiffusers/scripts/cocoeval_keypoints_score/get_openpose_keypoints_result_coco_format.py +++ /dev/null @@ -1,264 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
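The script below (`get_openpose_keypoints_result_coco_format.py`) writes keypoint results in the COCO format that `cocoeval_keypoints.py` above feeds into `COCOeval` with the `"keypoints"` metric. A minimal sketch of one detection entry in the `*_dt.json` file; the field values here are placeholders, not real predictions:

```python
# One detection entry consumed by COCOeval("keypoints"):
# 17 COCO keypoints x (x, y, visibility) = 51 numbers per person instance.
detection = {
    "image_id": 0,        # must match an image id in the ground-truth file
    "category_id": 1,     # "person"
    "keypoints": [0.0, 0.0, 2] * 17,
    "score": 0.9,         # overall confidence for this instance
}
```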
- -import json -import os -import pathlib -from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser - -import cv2 -import numpy as np -import paddle -import paddlehub as hub -from annotator.ppdet_hrnet.det_keypoint_unite_infer import PPDetPose - -# import PIL -from PIL import Image -from tqdm import tqdm - - -def keypoint_to_openpose_kpts(coco_keypoints_list): - # coco keypoints: [x1,y1,v1,...,xk,yk,vk] (k=17) - # ['Nose', Leye', 'Reye', 'Lear', 'Rear', 'Lsho', 'Rsho', 'Lelb', - # 'Relb', 'Lwri', 'Rwri', 'Lhip', 'Rhip', 'Lkne', 'Rkne', 'Lank', 'Rank'] - # openpose keypoints: [y1,...,yk], [x1,...xk] (k=18, with Neck) - # ['Nose', *'Neck'*, 'Rsho', 'Relb', 'Rwri', 'Lsho', 'Lelb', 'Lwri','Rhip', - # 'Rkne', 'Rank', 'Lhip', 'Lkne', 'Lank', 'Reye', 'Leye', 'Rear', 'Lear'] - indices = [0, 6, 8, 10, 5, 7, 9, 12, 14, 16, 11, 13, 15, 2, 1, 4, 3] - openpose_kpts = [] - for i in indices: - openpose_kpts.append(coco_keypoints_list[i]) - - # Get 'Neck' keypoint by interpolating between 'Lsho' and 'Rsho' keypoints - l_shoulder_index = 5 - r_shoulder_index = 6 - l_shoulder_keypoint = coco_keypoints_list[l_shoulder_index] - r_shoulder_keypoint = coco_keypoints_list[r_shoulder_index] - - neck_keypoint_y = int((l_shoulder_keypoint[1] + r_shoulder_keypoint[1]) / 2.0) - neck_keypoint_x = int((l_shoulder_keypoint[0] + r_shoulder_keypoint[0]) / 2.0) - neck_keypoint = [neck_keypoint_x, neck_keypoint_y, min(l_shoulder_keypoint[2], r_shoulder_keypoint[2])] - open_pose_neck_index = 1 - openpose_kpts.insert(open_pose_neck_index, neck_keypoint) - - return openpose_kpts - - -class PPDetDetector: - def __init__(self): - self.body_estimation = hub.Module(name="openpose_body_estimation") - self.hand_estimation = hub.Module(name="openpose_hands_estimation") - self.ppdetpose = PPDetPose() - - def __call__(self, oriImg, detect_resolution=512, hand=False): - with paddle.no_grad(): - img_scalarfactor = detect_resolution / min(oriImg.shape[:2]) - result, poseres = self.ppdetpose_pred(oriImg) - result["candidate"] = result["candidate"] * img_scalarfactor - oriImg = cv2.resize(oriImg, (0, 0), fx=img_scalarfactor, fy=img_scalarfactor) - canvas = oriImg.copy() - canvas.fill(0) - canvas = self.body_estimation.draw_pose(canvas, result["candidate"], result["subset"]) - - return canvas, dict(candidate=result["candidate"].tolist(), subset=result["subset"].tolist()), poseres - - def ppdetpose_pred(self, image, kpt_threshold=0.3): - poseres = self.ppdetpose.ppdet_hrnet_infer(image) - keypoints = poseres["keypoint"][0] - num_kpts = len(keypoints) - subset = np.ones((num_kpts, 20)) * -1 - candidate = np.zeros((0, 4)) - posnum = 0 - for kptid, keypoint in enumerate(keypoints): - openpose_kpts = keypoint_to_openpose_kpts(keypoint) - for idx, item in enumerate(openpose_kpts): - if item[2] > kpt_threshold: - subset[kptid][idx] = posnum - kpt = np.array( - item - + [ - posnum, - ] - ) - candidate = np.vstack((candidate, kpt)) - posnum += 1 - return {"candidate": candidate, "subset": subset}, poseres - - -annotator_ckpts_path = os.path.join(os.path.dirname(__file__), "ckpts") - - -def HWC3(x): - assert x.dtype == np.uint8 - if x.ndim == 2: - x = x[:, :, None] - assert x.ndim == 3 - H, W, C = x.shape - assert C == 1 or C == 3 or C == 4 - if C == 3: - return x - if C == 1: - return np.concatenate([x, x, x], axis=2) - if C == 4: - color = x[:, :, 0:3].astype(np.float32) - alpha = x[:, :, 3:4].astype(np.float32) / 255.0 - y = color * alpha + 255.0 * (1.0 - alpha) - y = y.clip(0, 255).astype(np.uint8) - return y - - -def 
resize_image(input_image, resolution): - H, W, C = input_image.shape - H = float(H) - W = float(W) - k = float(resolution) / min(H, W) - H *= k - W *= k - H = int(np.round(H / 64.0)) * 64 - W = int(np.round(W / 64.0)) * 64 - img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA) - return img - - -def get_keypoints_result_coco_format(paths, detector, do_gt): - """Get keypoints result in coco format""" - if not os.path.exists(paths[0]): - raise RuntimeError("Invalid path: %s" % paths[0]) - in_dir_path = pathlib.Path(paths[0]) - if len(paths) == 3: - out_dir_path = pathlib.Path(paths[2]) - if not os.path.exists(out_dir_path): - os.makedirs(out_dir_path) - files = sorted([file for ext in IMAGE_EXTENSIONS for file in in_dir_path.glob("*.{}".format(ext))]) - output = [] - index = -1 - for file in tqdm(files): - index += 1 - im = Image.open(file) - im = np.array(im, dtype=np.uint8) - input_image = HWC3(im) - canvas, keypoints_result, poseres = detector(input_image) - if len(paths) == 3: - Image.fromarray(canvas).save(os.path.join(out_dir_path, os.path.basename(file))) - if len(poseres["keypoint"][0]) == 0: - sample_dict = { - "image_id": index, - "category_id": 1, - "keypoints": [0, 0, 0] * 17, - "score": 0, - "id": index, - "num_keypoints": 0, - "bbox": [0, 0, 0, 0], - "area": 0, - "iscrowd": 0, - } - else: - keypoints_list = [] - zero_num = 0 - for point in poseres["keypoint"][0][0]: - if point[2] < 0.3: - keypoints_list += [0, 0, 0] - zero_num += 1 - else: - keypoints_list += point[:2] + [2] - - sample_dict = { - "image_id": index, - "category_id": 1, - "keypoints": keypoints_list, - "score": poseres["keypoint"][1][0][0], - "id": index, - "num_keypoints": 17 - zero_num, - "bbox": poseres["bbox"][0], - "area": poseres["bbox"][0][2] * poseres["bbox"][0][3], - "iscrowd": 0, - } - - output.append(sample_dict) - - with open(paths[1], "w") as json_file: - if do_gt: - json_file.write( - json.dumps( - { - "annotations": output, - "images": [{"id": item} for item in list(range(index + 1))], - "categories": [ - { - "supercategory": "person", - "id": 1, - "name": "person", - "keypoints": [ - "nose", - "left_eye", - "right_eye", - "left_ear", - "right_ear", - "left_shoulder", - "right_shoulder", - "left_elbow", - "right_elbow", - "left_wrist", - "right_wrist", - "left_hip", - "right_hip", - "left_knee", - "right_knee", - "left_ankle", - "right_ankle", - ], - "skeleton": [ - [16, 14], - [14, 12], - [17, 15], - [15, 13], - [12, 13], - [6, 12], - [7, 13], - [6, 7], - [6, 8], - [7, 9], - [8, 10], - [9, 11], - [2, 3], - [1, 2], - [1, 3], - [2, 4], - [3, 5], - [4, 6], - [5, 7], - ], - } - ], - }, - indent=4, - ) - ) - else: - json_file.write(json.dumps(output, indent=4)) - - -parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) -parser.add_argument("--do_gt", action="store_true", help="whether to predict unseen future data") -parser.add_argument( - "path", type=str, nargs=3, help=("Paths to the input images dir, output json file, and output openpose images dir") -) - -IMAGE_EXTENSIONS = {"bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp"} - -if __name__ == "__main__": - args = parser.parse_args() - detector = PPDetDetector() - get_keypoints_result_coco_format(args.path, detector, args.do_gt) diff --git a/ppdiffusers/scripts/convert_diffusers_model/README.md b/ppdiffusers/scripts/convert_diffusers_model/README.md deleted file mode 100644 index 101aacd97dde..000000000000 --- a/ppdiffusers/scripts/convert_diffusers_model/README.md +++ 
/dev/null @@ -1,323 +0,0 @@ -# Stable Diffusion模型转换教程(Pytorch -> Paddle) - -本教程支持将Huggingface的[Diffusers](https://github.com/huggingface/diffusers)版本的Stable Diffusion权重转换成[PPDiffusers](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers)版本的Stable Diffusion权重。 - -Tips: -如果我们想要将原版的权重转换为[PPDiffusers](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers)的权重,我们可以首先使用 -Huggingface提供的转换脚本[convert_original_stable_diffusion_to_diffusers.py](https://github.com/huggingface/diffusers/blob/main/scripts/convert_original_stable_diffusion_to_diffusers.py)将原版权重转换为[Diffusers](https://github.com/huggingface/diffusers)版本的权重。 - -## 1 Diffusers 权重转换为 PPDiffusers权重 - -### 1.1 依赖安装 - -模型权重转换需要依赖`torch`, `diffusers`, `transformers`, `paddlepaddle`, `paddlenlp`以及`ppdiffusers`,我可使用`pip`执行下面的命令进行快速安装。 - -```shell -pip install -r requirements.txt -``` - -### 1.2 模型权重转换 - -___注意:模型权重转换过程中,需要下载Stable Diffusion模型。为了使用该模型与权重,你必须接受该模型所要求的License,并且获取HF Hub授予的Token。请访问HuggingFace的[model card](https://huggingface.co/runwayml/stable-diffusion-v1-5), 仔细阅读里面的License,然后签署该协议。___ - -___Tips: Stable Diffusion是基于以下的License: The CreativeML OpenRAIL M license is an Open RAIL M license, adapted from the work that BigScience and the RAIL Initiative are jointly carrying in the area of responsible AI licensing. See also the article about the BLOOM Open RAIL license on which this license is based.___ - -若第一次权重转换模型,需要先登录HuggingFace客户端。执行以下命令进行登录: - -```shell -# 安装huggingface_hub -pip install huggingface_hub -# 登录huggingface_hub -huggingface-cli login -``` - -登录成功后,可执行以下命令行完成模型权重转换。 - -```shell -python convert_diffusers_stable_diffusion_to_ppdiffusers.py --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 --output_path stable-diffusion-v1-5-ppdiffusers -``` - -输出的模型目录结构如下: -```shell -├── stable-diffusion-v1-5-ppdiffusers # 我们指定的输出文件路径 - ├── model_index.json # 模型index文件 - ├── vae # vae权重文件夹 - ├── model_state.pdparams - ├── config.json - ├── text_encoder # text_encoder权重文件夹 - ├── model_config.json - ├── model_state.pdparams - ├── unet # unet权重文件夹 - ├── model_state.pdparams - ├── config.json - ├── scheduler # scheduler文件夹 - ├── scheduler_config.json - ├── feature_extractor # feature_extractor文件夹 - ├── preprocessor_config.json - ├── safety_checker # safety_checker文件夹 - ├── model_config.json - ├── model_state.pdparams - ├── tokenizer # tokenizer文件夹 - ├── tokenizer_config.json - ├── merges.txt - ├── special_tokens_map.json - ├── vocab.json -``` - -#### 1.3 参数说明 - -`convert_diffusers_stable_diffusion_to_ppdiffusers.py` 各命令行参数的说明。 - -| 参数 |参数说明 | -|----------|--------------| -|
--pretrained_model_name_or_path
| Huggingface上提供的diffuers版本的diffusion预训练模型。默认为:"runwayml/stable-diffusion-v1-5"。更多diffusion预训练模型可参考[CompVis模型列表](https://huggingface.co/CompVis)及[runwayml模型列表](https://huggingface.co/runwayml),目前仅支持SD版模型。| -|--output_path | 转换后的模型目录。 | - - -## 2 原版Stable Diffusion模型权重转换为PPDiffusers权重 - -总共分为2个步骤 -- Step1 原版ckpt权重转换为Diffusers权重; -- Step2 Diffusers权重转换为PPDiffusers权重。 - -### 2.1 依赖安装 - -模型权重转换需要依赖`omegaconf`, `torch`, `diffusers`, `transformers`, `paddlepaddle`, `paddlenlp`以及`ppdiffusers`,我可使用`pip`执行下面的命令进行快速安装。 - -```shell -pip install -r requirements.txt -``` - -### 2.2 模型权重转换 - -#### Step1 原版ckpt权重转换为Diffusers权重 -在开始之前我们需要准备如下的文件: -- Huggingface提供的转换脚本, https://github.com/huggingface/diffusers/blob/main/scripts/convert_original_stable_diffusion_to_diffusers.py; -- 原版的权重文件, https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned.ckpt; -- yaml配置文件, https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml. - -所需的文件目录如下所示: -```shell -├── convert_original_stable_diffusion_to_diffusers.py # Huggingface的转换脚本 -├── v1-5-pruned.ckpt # 原版v1-5模型权重文件 -├── v1-inference.yaml # yaml配置文件 -``` - -```shell -python convert_original_stable_diffusion_to_diffusers.py --checkpoint_path v1-5-pruned.ckpt --original_config_file v1-inference.yaml --dump_path stable-diffusion-v1-5-diffusers -``` - -输出的模型目录结构如下: - -```shell -├── stable-diffusion-v1-5-diffusers # 我们指定的输出文件路径 - ├── model_index.json # 模型index文件 - ├── vae # vae权重文件夹 - ├── diffusion_pytorch_model.bin - ├── config.json - ├── text_encoder # text_encoder权重文件夹 - ├── config.json - ├── pytorch_model.bin - ├── unet # unet权重文件夹 - ├── diffusion_pytorch_model.bin - ├── config.json - ├── scheduler # scheduler文件夹 - ├── scheduler_config.json - ├── feature_extractor # feature_extractor文件夹 - ├── preprocessor_config.json - ├── safety_checker # safety_checker文件夹 - ├── config.json - ├── pytorch_model.bin - ├── tokenizer # tokenizer文件夹 - ├── tokenizer_config.json - ├── merges.txt - ├── special_tokens_map.json - ├── vocab.json -``` -#### 参数说明 - -`convert_original_stable_diffusion_to_diffusers.py` 各命令行参数的说明。 - -| 参数 |参数说明 | -|----------|--------------| -|
--checkpoint_path
| 原版Stable Diffusion模型ckpt后缀的权重文件。默认为:"v1-5-pruned.ckpt"。更多原版的预训练模型可在[HuggingFace上搜索](https://huggingface.co/)。| -|--original_config_file | 该权重文件所使用的配置文件,默认为"v1-inference.yaml"。 | -|--dump_path | 转换后的Diffusers版本模型目录。 | - -#### Step2 Diffusers权重转换为PPDiffusers权重 -由于我们已经得到了Huggingface的[Diffusers](https://github.com/huggingface/diffusers)版本的权重,因此我们可以参考第1部分进行权重转换。 - -我们仅需要运行下面的代码即可成功转换[PPDiffusers](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/ppdiffusers)版本的权重。 - -```shell -python convert_diffusers_stable_diffusion_to_ppdiffusers.py --pretrained_model_name_or_path stable-diffusion-v1-5-diffusers --output_path stable-diffusion-v1-5-ppdiffusers -``` - -脚本运行完成后,输出的模型目录结构如下: -```shell -├── stable-diffusion-v1-5-ppdiffusers # 我们指定的输出文件路径 - ├── model_index.json # 模型index文件 - ├── vae # vae权重文件夹 - ├── model_state.pdparams - ├── config.json - ├── text_encoder # text_encoder权重文件夹 - ├── model_config.json - ├── model_state.pdparams - ├── unet # unet权重文件夹 - ├── model_state.pdparams - ├── config.json - ├── scheduler # scheduler文件夹 - ├── scheduler_config.json - ├── feature_extractor # feature_extractor文件夹 - ├── preprocessor_config.json - ├── safety_checker # safety_checker文件夹 - ├── model_config.json - ├── model_state.pdparams - ├── tokenizer # tokenizer文件夹 - ├── tokenizer_config.json - ├── merges.txt - ├── special_tokens_map.json - ├── vocab.json -``` - - - - - -## 3 转换后的权重效果对比 - -### 3.1 Text-to-Image效果对比 -```python -import torch -from diffusers import StableDiffusionPipeline as DiffusersStableDiffusionPipeline -pipe = DiffusersStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") -seed = 1024 -generator = torch.Generator("cuda").manual_seed(seed) -prompt = "a photo of an astronaut riding a horse on mars" -image = pipe(prompt, generator=generator).images[0] -image.save("diffusers_astronaut_rides_horse.png") -``` -![diffusers_astronaut_rides_horse](https://user-images.githubusercontent.com/50394665/201277740-c9b37d59-4ec0-4b3d-8118-bd7f0dfaf352.png) - -```python -import paddle -from ppdiffusers import StableDiffusionPipeline as PPDiffusersStableDiffusionPipeline -pipe = PPDiffusersStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") -prompt = "a photo of an astronaut riding a horse on mars" -seed = 1024 -paddle.seed(seed) -image = pipe(prompt).images[0] -image.save("ppdiffusers_astronaut_rides_horse.png") -``` -![ppdiffusers_astronaut_rides_horse](https://user-images.githubusercontent.com/50394665/201277735-fafa458a-9409-4795-887a-897a2851753d.png) - -### 3.2 Image-to-Image text-guided generation效果对比 -```python -import requests -import torch -from PIL import Image -from io import BytesIO - -from diffusers import StableDiffusionImg2ImgPipeline as DiffusersStableDiffusionImg2ImgPipeline - -pipe = DiffusersStableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - -url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" - -response = requests.get(url) -image = Image.open(BytesIO(response.content)).convert("RGB") -image = image.resize((768, 512)) - -prompt = "A fantasy landscape, trending on artstation" -seed = 1024 -generator = torch.Generator("cuda").manual_seed(seed) -image = pipe(prompt=prompt, image=image, strength=0.75, guidance_scale=7.5, generator=generator).images[0] - -image.save("diffusers_fantasy_landscape.png") -``` -![diffusers_fantasy_landscape](https://user-images.githubusercontent.com/50394665/201277726-2c2f2fc8-dbfe-4b38-9940-9000bb6c8333.png) - -```python -import 
requests -import paddle -from PIL import Image -from io import BytesIO - -from ppdiffusers import StableDiffusionImg2ImgPipeline as PPDiffusersStableDiffusionImg2ImgPipeline - -pipe = PPDiffusersStableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - -url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" - -response = requests.get(url) -image = Image.open(BytesIO(response.content)).convert("RGB") -image = image.resize((768, 512)) - -prompt = "A fantasy landscape, trending on artstation" -seed = 1024 -paddle.seed(seed) -image = pipe(prompt=prompt, image=image, strength=0.75, guidance_scale=7.5).images[0] - -image.save("ppdiffusers_fantasy_landscape.png") -``` -![ppdiffusers_fantasy_landscape](https://user-images.githubusercontent.com/50394665/201277718-f01e8f8d-b560-442f-bf93-c026285c337e.png) -### 3.3 In-painting效果对比 -```python -import torch -import PIL -import requests -from io import BytesIO - -from diffusers import StableDiffusionInpaintPipeline as DiffusersStableDiffusionInpaintPipeline, EulerAncestralDiscreteScheduler as DiffusersEulerAncestralDiscreteScheduler - -def download_image(url): - response = requests.get(url) - return PIL.Image.open(BytesIO(response.content)).convert("RGB") - - -img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" -mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" - -image = download_image(img_url).resize((512, 512)) -mask_image = download_image(mask_url).resize((512, 512)) -scheduler = DiffusersEulerAncestralDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") -pipe = DiffusersStableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting", scheduler=scheduler) - -prompt = "Face of a yellow cat, high resolution, sitting on a park bench" -seed = 1024 -generator = torch.Generator("cuda").manual_seed(seed) -image = pipe(prompt=prompt, image=image, mask_image=mask_image, generator=generator).images[0] - -image.save("diffusers_cat_on_bench.png") -``` -![diffusers_cat_on_bench](https://user-images.githubusercontent.com/50394665/201277724-76145ee6-a3ef-49e7-a1e9-8ccd3c9eb39e.png) - -```python -import paddle -import PIL -import requests -from io import BytesIO - -from ppdiffusers import StableDiffusionInpaintPipeline as PPDiffusersStableDiffusionInpaintPipeline, EulerAncestralDiscreteScheduler as PPDiffusersEulerAncestralDiscreteScheduler - -def download_image(url): - response = requests.get(url) - return PIL.Image.open(BytesIO(response.content)).convert("RGB") - - -img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" -mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" - -image = download_image(img_url).resize((512, 512)) -mask_image = download_image(mask_url).resize((512, 512)) -scheduler = PPDiffusersEulerAncestralDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") -pipe = PPDiffusersStableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting", scheduler=scheduler) - -prompt = "Face of a yellow cat, high resolution, sitting on a park bench" -seed = 1024 -paddle.seed(seed) -image = pipe(prompt=prompt, image=image, mask_image=mask_image).images[0] - -image.save("ppdiffusers_cat_on_bench.png") -``` 
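Beyond comparing the generated images visually, the converted weights themselves can be checked. A minimal sketch, assuming the Diffusers and PPDiffusers pipelines above are loaded as `pipe_torch` and `pipe_paddle` (hypothetical names); convolution weights keep the same key and layout in both frameworks, so a direct comparison needs no transpose:

```python
import numpy as np

# "conv_in.weight" is a convolution parameter of the UNet and is copied
# verbatim by the conversion script, so the two tensors should match closely.
torch_w = dict(pipe_torch.unet.named_parameters())["conv_in.weight"].detach().cpu().numpy()
paddle_w = pipe_paddle.unet.state_dict()["conv_in.weight"].numpy()
assert np.allclose(torch_w, paddle_w, atol=1e-5)
```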
-![ppdiffusers_cat_on_bench](https://user-images.githubusercontent.com/50394665/201277712-2e10c188-e1ca-44f5-b963-657e9d51cc95.png) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionImageVariation_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionImageVariation_to_ppdiffusers.py deleted file mode 100644 index b363cfce3527..000000000000 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionImageVariation_to_ppdiffusers.py +++ /dev/null @@ -1,250 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -from collections import OrderedDict - -import paddle -import torch -from diffusers import ( - StableDiffusionImageVariationPipeline as DiffusersStableDiffusionImageVariationPipeline, -) - -from paddlenlp.transformers import ( - CLIPFeatureExtractor, - CLIPVisionConfig, - CLIPVisionModelWithProjection, -) -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - LMSDiscreteScheduler, - PNDMScheduler, -) -from ppdiffusers import ( - StableDiffusionImageVariationPipeline as PPDiffusersStableDiffusionImageVariationPipeline, -) -from ppdiffusers import UNet2DConditionModel -from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker - -paddle.set_device("cpu") - - -def convert_to_ppdiffusers(vae_or_unet, dtype="float32"): - need_transpose = [] - for k, v in vae_or_unet.named_modules(): - if isinstance(v, torch.nn.Linear): - need_transpose.append(k + ".weight") - new_vae_or_unet = OrderedDict() - for k, v in vae_or_unet.state_dict().items(): - if k not in need_transpose: - new_vae_or_unet[k] = v.cpu().numpy().astype(dtype) - else: - new_vae_or_unet[k] = v.t().cpu().numpy().astype(dtype) - return new_vae_or_unet - - -def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True, need_prefix=False): - new_model_state = {} - transformers2ppnlp = { - ".encoder.": ".transformer.", - ".layer_norm": ".norm", - ".mlp.": ".", - ".fc1.": ".linear1.", - ".fc2.": ".linear2.", - ".final_layer_norm.": ".ln_final.", - ".embeddings.": ".", - ".position_embedding.": ".positional_embedding.", - ".patch_embedding.": ".conv1.", - "visual_projection.weight": "vision_projection", - "text_projection.weight": "text_projection", - ".pre_layrnorm.": ".ln_pre.", - ".post_layernorm.": ".ln_post.", - ".vision_model.": ".", - } - ignore_value = ["position_ids"] - donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] - - for name, value in clip.state_dict().items(): - # step1: ignore position_ids - if any(i in name for i in ignore_value): - continue - # step2: transpose nn.Linear weight - if value.ndim == 2 and not any(i in name for i in donot_transpose): - value = value.t() - # step3: hf_name -> ppnlp_name mapping - for hf_name, ppnlp_name in transformers2ppnlp.items(): - name = name.replace(hf_name, ppnlp_name) - # step4: 0d tensor -> 1d tensor - if name == 
"logit_scale": - value = value.reshape((1,)) - # step5: safety_checker need prefix "clip." - if "vision_model" in name and need_prefix: - name = "clip." + name - new_model_state[name] = value.cpu().numpy().astype(dtype) - - if is_text_encoder: - new_config = { - "max_text_length": clip.config.max_position_embeddings, - "vocab_size": clip.config.vocab_size, - "text_embed_dim": clip.config.hidden_size, - "text_heads": clip.config.num_attention_heads, - "text_layers": clip.config.num_hidden_layers, - "text_hidden_act": clip.config.hidden_act, - "projection_dim": clip.config.projection_dim, - "initializer_range": clip.config.initializer_range, - "initializer_factor": clip.config.initializer_factor, - } - else: - if need_prefix: - new_config = { - "image_resolution": clip.config.vision_config.image_size, - "vision_layers": clip.config.vision_config.num_hidden_layers, - "vision_heads": clip.config.vision_config.num_attention_heads, - "vision_embed_dim": clip.config.vision_config.hidden_size, - "vision_patch_size": clip.config.vision_config.patch_size, - "vision_mlp_ratio": clip.config.vision_config.intermediate_size - // clip.config.vision_config.hidden_size, - "vision_hidden_act": clip.config.vision_config.hidden_act, - "projection_dim": clip.config.projection_dim, - } - else: - new_config = { - "image_resolution": clip.config.image_size, - "vision_layers": clip.config.num_hidden_layers, - "vision_heads": clip.config.num_attention_heads, - "vision_embed_dim": clip.config.hidden_size, - "vision_patch_size": clip.config.patch_size, - "vision_mlp_ratio": clip.config.intermediate_size // clip.config.hidden_size, - "vision_hidden_act": clip.config.hidden_act, - "projection_dim": clip.config.projection_dim, - } - return new_model_state, new_config - - -def check_keys(model, state_dict): - cls_name = model.__class__.__name__ - missing_keys = [] - mismatched_keys = [] - for k, v in model.state_dict().items(): - if k not in state_dict.keys(): - missing_keys.append(k) - if list(v.shape) != list(state_dict[k].shape): - mismatched_keys.append(k) - if len(missing_keys): - missing_keys_str = ", ".join(missing_keys) - print(f"{cls_name} Found missing_keys {missing_keys_str}!") - if len(mismatched_keys): - mismatched_keys_str = ", ".join(mismatched_keys) - print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") - - -def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): - # 0. load diffusers pipe and convert to ppdiffusers weights format - diffusers_pipe = DiffusersStableDiffusionImageVariationPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True - ) - vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) - unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) - image_encoder_state_dict, vision_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.image_encoder, is_text_encoder=False, need_prefix=False - ) - safety_checker_state_dict, safety_checker_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.safety_checker, is_text_encoder=False, need_prefix=True - ) - - # 1. vae - pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) - - pp_vae.set_dict(vae_state_dict) - check_keys(pp_vae, vae_state_dict) - # 2. 
unet - pp_unet = UNet2DConditionModel.from_config(diffusers_pipe.unet.config) - - pp_unet.set_dict(unet_state_dict) - check_keys(pp_unet, unet_state_dict) - - # make sure - vision_config.update({"projection_dim": pp_unet.config.cross_attention_dim}) - safety_checker_config.update({"projection_dim": pp_unet.config.cross_attention_dim}) - - # 3. image_encoder - image_encoder = CLIPVisionModelWithProjection(CLIPVisionConfig.from_dict(vision_config)) - image_encoder.set_dict(image_encoder_state_dict) - check_keys(image_encoder, image_encoder_state_dict) - # 4. safety_checker - pp_safety_checker = StableDiffusionSafetyChecker(CLIPVisionConfig.from_dict(safety_checker_config)) - pp_safety_checker.set_dict(safety_checker_state_dict) - check_keys(pp_safety_checker, safety_checker_state_dict) - # 5. scheduler - beta_start = diffusers_pipe.scheduler.beta_start - beta_end = diffusers_pipe.scheduler.beta_end - scheduler_type = diffusers_pipe.scheduler._class_name.lower() - if "pndm" in scheduler_type: - pp_scheduler = PNDMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - # Make sure the scheduler compatible with DDIM - set_alpha_to_one=False, - steps_offset=1, - # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, - ) - elif "lms" in scheduler_type: - pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") - elif "ddim" in scheduler_type: - pp_scheduler = DDIMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - # Make sure the scheduler compatible with DDIM - clip_sample=False, - set_alpha_to_one=False, - steps_offset=1, - ) - else: - raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - - pp_feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-v1-4/feature_extractor") - - # 7. create ppdiffusers pipe - paddle_pipe = PPDiffusersStableDiffusionImageVariationPipeline( - vae=pp_vae, - image_encoder=image_encoder, - unet=pp_unet, - safety_checker=pp_safety_checker, - feature_extractor=pp_feature_extractor, - scheduler=pp_scheduler, - ) - # 8. save_pretrained - paddle_pipe.save_pretrained(output_path) - return paddle_pipe - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default="fusing/sd-image-variations-diffusers", - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--output_path", - type=str, - default="sd-image-variations-ppdiffusers", - help="The model output path.", - ) - args = parser.parse_args() - ppdiffusers_pipe = convert_diffusers_to_ppdiffusers(args.pretrained_model_name_or_path, args.output_path) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionUpscalePipeline_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionUpscalePipeline_to_ppdiffusers.py deleted file mode 100644 index ceec649685bd..000000000000 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionUpscalePipeline_to_ppdiffusers.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import tempfile -from collections import OrderedDict - -import paddle -import torch -from diffusers import ( - StableDiffusionUpscalePipeline as DiffusersStableDiffusionUpscalePipeline, -) - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - DDPMScheduler, - LMSDiscreteScheduler, - PNDMScheduler, -) -from ppdiffusers import ( - StableDiffusionUpscalePipeline as PPDiffusersStableDiffusionUpscalePipeline, -) -from ppdiffusers import UNet2DConditionModel - -paddle.set_device("cpu") - - -def convert_to_ppdiffusers(vae_or_unet, dtype="float32"): - need_transpose = [] - for k, v in vae_or_unet.named_modules(): - if isinstance(v, torch.nn.Linear): - need_transpose.append(k + ".weight") - new_vae_or_unet = OrderedDict() - for k, v in vae_or_unet.state_dict().items(): - if k not in need_transpose: - new_vae_or_unet[k] = v.cpu().numpy().astype(dtype) - else: - new_vae_or_unet[k] = v.t().cpu().numpy().astype(dtype) - return new_vae_or_unet - - -def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): - new_model_state = {} - transformers2ppnlp = { - ".encoder.": ".transformer.", - ".layer_norm": ".norm", - ".mlp.": ".", - ".fc1.": ".linear1.", - ".fc2.": ".linear2.", - ".final_layer_norm.": ".ln_final.", - ".embeddings.": ".", - ".position_embedding.": ".positional_embedding.", - ".patch_embedding.": ".conv1.", - "visual_projection.weight": "vision_projection", - "text_projection.weight": "text_projection", - ".pre_layrnorm.": ".ln_pre.", - ".post_layernorm.": ".ln_post.", - ".vision_model.": ".", - } - ignore_value = ["position_ids"] - donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] - - for name, value in clip.state_dict().items(): - # step1: ignore position_ids - if any(i in name for i in ignore_value): - continue - # step2: transpose nn.Linear weight - if value.ndim == 2 and not any(i in name for i in donot_transpose): - value = value.t() - # step3: hf_name -> ppnlp_name mapping - for hf_name, ppnlp_name in transformers2ppnlp.items(): - name = name.replace(hf_name, ppnlp_name) - # step4: 0d tensor -> 1d tensor - if name == "logit_scale": - value = value.reshape((1,)) - # step5: safety_checker need prefix "clip." - if "vision_model" in name: - name = "clip." 
+ name - new_model_state[name] = value.cpu().numpy().astype(dtype) - - if is_text_encoder: - new_config = { - "max_text_length": clip.config.max_position_embeddings, - "vocab_size": clip.config.vocab_size, - "text_embed_dim": clip.config.hidden_size, - "text_heads": clip.config.num_attention_heads, - "text_layers": clip.config.num_hidden_layers, - "text_hidden_act": clip.config.hidden_act, - "projection_dim": clip.config.projection_dim, - "initializer_range": clip.config.initializer_range, - "initializer_factor": clip.config.initializer_factor, - } - else: - new_config = { - "image_resolution": clip.config.vision_config.image_size, - "vision_layers": clip.config.vision_config.num_hidden_layers, - "vision_heads": clip.config.vision_config.num_attention_heads, - "vision_embed_dim": clip.config.vision_config.hidden_size, - "vision_patch_size": clip.config.vision_config.patch_size, - "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size, - "vision_hidden_act": clip.config.vision_config.hidden_act, - "projection_dim": clip.config.projection_dim, - } - return new_model_state, new_config - - -def convert_diffusers_stable_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): - # 0. load diffusers pipe and convert to ppdiffusers weights format - diffusers_pipe = DiffusersStableDiffusionUpscalePipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True - ) - vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) - unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) - text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder, is_text_encoder=True - ) - max_noise_level = diffusers_pipe.max_noise_level - - # 1. vae - pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) - - pp_vae.set_dict(vae_state_dict) - - # 2. unet - pp_unet = UNet2DConditionModel.from_config(diffusers_pipe.unet.config) - - pp_unet.set_dict(unet_state_dict) - - # 3. text_encoder - pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config)) - pp_text_encoder.set_dict(text_encoder_state_dict) - - # 4. scheduler - beta_start = diffusers_pipe.scheduler.beta_start - beta_end = diffusers_pipe.scheduler.beta_end - beta_schedule = diffusers_pipe.scheduler.beta_schedule - num_train_timesteps = diffusers_pipe.scheduler.num_train_timesteps - scheduler_type = diffusers_pipe.scheduler._class_name.lower() - if "pndm" in scheduler_type: - pp_scheduler = PNDMScheduler( - beta_end=beta_end, - beta_schedule=beta_schedule, - beta_start=beta_start, - num_train_timesteps=num_train_timesteps, - skip_prk_steps=True, - ) - elif "lms" in scheduler_type: - pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule=beta_schedule) - elif "ddim" in scheduler_type: - pp_scheduler = DDIMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule=beta_schedule, - clip_sample=False, - prediction_type="v_prediction", - set_alpha_to_one=False, - steps_offset=1, - ) - else: - raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - - # 5. 
low_res_scheduler - beta_start = diffusers_pipe.low_res_scheduler.beta_start - beta_end = diffusers_pipe.low_res_scheduler.beta_end - num_train_timesteps = diffusers_pipe.low_res_scheduler.num_train_timesteps - beta_schedule = diffusers_pipe.low_res_scheduler.beta_schedule - scheduler_type = diffusers_pipe.low_res_scheduler._class_name.lower() - if "pndm" in scheduler_type: - pp_low_res_scheduler = PNDMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - # Make sure the scheduler compatible with DDIM - set_alpha_to_one=False, - steps_offset=1, - # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, - ) - elif "ddpm" in scheduler_type: - pp_low_res_scheduler = DDPMScheduler( - beta_end=beta_end, - beta_schedule=beta_schedule, - beta_start=beta_start, - num_train_timesteps=num_train_timesteps, - ) - elif "lms" in scheduler_type: - pp_low_res_scheduler = LMSDiscreteScheduler( - beta_start=beta_start, beta_end=beta_end, beta_schedule=beta_schedule - ) - elif "ddim" in scheduler_type: - pp_low_res_scheduler = DDIMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - # Make sure the scheduler compatible with DDIM - clip_sample=False, - set_alpha_to_one=False, - steps_offset=1, - ) - else: - raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - - with tempfile.TemporaryDirectory() as tmpdirname: - # 6. tokenizer - diffusers_pipe.tokenizer.save_pretrained(tmpdirname) - pp_tokenizer = CLIPTokenizer.from_pretrained(tmpdirname) - # 7. create ppdiffusers pipe - paddle_pipe = PPDiffusersStableDiffusionUpscalePipeline( - max_noise_level=max_noise_level, - vae=pp_vae, - text_encoder=pp_text_encoder, - tokenizer=pp_tokenizer, - unet=pp_unet, - low_res_scheduler=pp_low_res_scheduler, - scheduler=pp_scheduler, - ) - - # 9. save_pretrained - paddle_pipe.save_pretrained(output_path) - return paddle_pipe - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default="stabilityai/stable-diffusion-x4-upscaler", - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--output_path", - type=str, - default="stable-diffusion-x4-upscaler-ppdiffusers", - help="The model output path.", - ) - args = parser.parse_args() - ppdiffusers_pipe = convert_diffusers_stable_diffusion_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path - ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_VersatileDiffusion_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_VersatileDiffusion_to_ppdiffusers.py deleted file mode 100644 index 02874fb4265f..000000000000 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_VersatileDiffusion_to_ppdiffusers.py +++ /dev/null @@ -1,263 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import tempfile -from collections import OrderedDict - -import paddle -import torch -from diffusers import VersatileDiffusionPipeline as DiffusersVersatileDiffusionPipeline - -from paddlenlp.transformers import ( - CLIPFeatureExtractor, - CLIPTextConfig, - CLIPTextModelWithProjection, - CLIPTokenizer, - CLIPVisionConfig, - CLIPVisionModelWithProjection, -) -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - UNet2DConditionModel, -) -from ppdiffusers import ( - VersatileDiffusionPipeline as PPDiffusersVersatileDiffusionPipeline, -) -from ppdiffusers.pipelines.versatile_diffusion import UNetFlatConditionModel - -paddle.set_device("cpu") - - -def convert_to_ppdiffusers(vae_or_unet, dtype="float32"): - need_transpose = [] - for k, v in vae_or_unet.named_modules(): - if isinstance(v, torch.nn.Linear): - need_transpose.append(k + ".weight") - new_vae_or_unet = OrderedDict() - for k, v in vae_or_unet.state_dict().items(): - if k not in need_transpose: - new_vae_or_unet[k] = v.cpu().numpy().astype(dtype) - else: - new_vae_or_unet[k] = v.t().cpu().numpy().astype(dtype) - return new_vae_or_unet - - -def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True, need_prefix=False): - new_model_state = {} - transformers2ppnlp = { - ".encoder.": ".transformer.", - ".layer_norm": ".norm", - ".mlp.": ".", - ".fc1.": ".linear1.", - ".fc2.": ".linear2.", - ".final_layer_norm.": ".ln_final.", - ".embeddings.": ".", - ".position_embedding.": ".positional_embedding.", - ".patch_embedding.": ".conv1.", - "visual_projection.weight": "vision_projection", - "text_projection.weight": "text_projection", - ".pre_layrnorm.": ".ln_pre.", - ".post_layernorm.": ".ln_post.", - ".vision_model.": ".", - } - ignore_value = ["position_ids"] - donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] - - for name, value in clip.state_dict().items(): - # step1: ignore position_ids - if any(i in name for i in ignore_value): - continue - # step2: transpose nn.Linear weight - if value.ndim == 2 and not any(i in name for i in donot_transpose): - value = value.t() - # step3: hf_name -> ppnlp_name mapping - for hf_name, ppnlp_name in transformers2ppnlp.items(): - name = name.replace(hf_name, ppnlp_name) - # step4: 0d tensor -> 1d tensor - if name == "logit_scale": - value = value.reshape((1,)) - # step5: safety_checker need prefix "clip." - if "vision_model" in name and need_prefix: - name = "clip." 
+ name - new_model_state[name] = value.cpu().numpy().astype(dtype) - - if is_text_encoder: - new_config = { - "max_text_length": clip.config.max_position_embeddings, - "vocab_size": clip.config.vocab_size, - "text_embed_dim": clip.config.hidden_size, - "text_heads": clip.config.num_attention_heads, - "text_layers": clip.config.num_hidden_layers, - "text_hidden_act": clip.config.hidden_act, - "projection_dim": clip.config.projection_dim, - "initializer_range": clip.config.initializer_range, - "initializer_factor": clip.config.initializer_factor, - } - else: - if need_prefix: - new_config = { - "image_resolution": clip.config.vision_config.image_size, - "vision_layers": clip.config.vision_config.num_hidden_layers, - "vision_heads": clip.config.vision_config.num_attention_heads, - "vision_embed_dim": clip.config.vision_config.hidden_size, - "vision_patch_size": clip.config.vision_config.patch_size, - "vision_mlp_ratio": clip.config.vision_config.intermediate_size - // clip.config.vision_config.hidden_size, - "vision_hidden_act": clip.config.vision_config.hidden_act, - "projection_dim": clip.config.projection_dim, - } - else: - new_config = { - "image_resolution": clip.config.image_size, - "vision_layers": clip.config.num_hidden_layers, - "vision_heads": clip.config.num_attention_heads, - "vision_embed_dim": clip.config.hidden_size, - "vision_patch_size": clip.config.patch_size, - "vision_mlp_ratio": clip.config.intermediate_size // clip.config.hidden_size, - "vision_hidden_act": clip.config.hidden_act, - "projection_dim": clip.config.projection_dim, - } - return new_model_state, new_config - - -def check_keys(model, state_dict): - cls_name = model.__class__.__name__ - missing_keys = [] - mismatched_keys = [] - for k, v in model.state_dict().items(): - if k not in state_dict.keys(): - missing_keys.append(k) - if list(v.shape) != list(state_dict[k].shape): - mismatched_keys.append(k) - if len(missing_keys): - missing_keys_str = ", ".join(missing_keys) - print(f"{cls_name} Found missing_keys {missing_keys_str}!") - if len(mismatched_keys): - mismatched_keys_str = ", ".join(mismatched_keys) - print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") - - -def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): - # 0. load diffusers pipe and convert to ppdiffusers weights format - diffusers_pipe = DiffusersVersatileDiffusionPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True - ) - vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) - image_unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.image_unet) - text_unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.text_unet) - - text_encoder_state_dict, text_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder, is_text_encoder=True, need_prefix=False - ) - - image_encoder_state_dict, vision_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.image_encoder, is_text_encoder=False, need_prefix=False - ) - - # 1. vae - pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) - - pp_vae.set_dict(vae_state_dict) - check_keys(pp_vae, vae_state_dict) - - # 2. image_unet - pp_image_unet = UNet2DConditionModel(**diffusers_pipe.image_unet.config) - pp_image_unet.set_dict(image_unet_state_dict) - check_keys(pp_image_unet, image_unet_state_dict) - - # 3. text_unet - pp_text_unet = UNetFlatConditionModel(**diffusers_pipe.text_unet.config) - pp_text_unet.set_dict(text_unet_state_dict) - check_keys(pp_text_unet, text_unet_state_dict) - - # 4. 
image_encoder - pp_image_encoder = CLIPVisionModelWithProjection(CLIPVisionConfig.from_dict(vision_config)) - pp_image_encoder.set_dict(image_encoder_state_dict) - check_keys(pp_image_encoder, image_encoder_state_dict) - - # 5. text_encoder - pp_text_encoder = CLIPTextModelWithProjection(CLIPTextConfig.from_dict(text_config)) - pp_text_encoder.set_dict(text_encoder_state_dict) - check_keys(pp_text_encoder, text_encoder_state_dict) - - # 6. scheduler - beta_start = diffusers_pipe.scheduler.beta_start - beta_end = diffusers_pipe.scheduler.beta_end - scheduler_type = diffusers_pipe.scheduler._class_name.lower() - if "pndm" in scheduler_type: - pp_scheduler = PNDMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - # Make sure the scheduler compatible with DDIM - set_alpha_to_one=False, - steps_offset=1, - # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, - ) - elif "lms" in scheduler_type: - pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") - elif "ddim" in scheduler_type: - pp_scheduler = DDIMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - # Make sure the scheduler compatible with DDIM - clip_sample=False, - set_alpha_to_one=False, - steps_offset=1, - ) - else: - raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - - with tempfile.TemporaryDirectory() as tmpdirname: - pp_feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-v1-4/feature_extractor") - # 7. tokenizer - diffusers_pipe.tokenizer.save_pretrained(tmpdirname) - pp_tokenizer = CLIPTokenizer.from_pretrained(tmpdirname) - # 8. create ppdiffusers pipe - paddle_pipe = PPDiffusersVersatileDiffusionPipeline( - tokenizer=pp_tokenizer, - image_feature_extractor=pp_feature_extractor, - text_encoder=pp_text_encoder, - image_encoder=pp_image_encoder, - image_unet=pp_image_unet, - text_unet=pp_text_unet, - vae=pp_vae, - scheduler=pp_scheduler, - ) - # 9. save_pretrained - paddle_pipe.save_pretrained(output_path) - return paddle_pipe - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default="shi-labs/versatile-diffusion", - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--output_path", - type=str, - default="versatile-diffusion-ppdiffusers", - help="The model output path.", - ) - args = parser.parse_args() - ppdiffusers_pipe = convert_diffusers_to_ppdiffusers(args.pretrained_model_name_or_path, args.output_path) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_alt_diffusion_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_alt_diffusion_to_ppdiffusers.py deleted file mode 100644 index 7faa2664fb09..000000000000 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_alt_diffusion_to_ppdiffusers.py +++ /dev/null @@ -1,331 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import tempfile -from collections import OrderedDict - -import paddle -import torch -from diffusers import AltDiffusionPipeline as DiffusersAltDiffusionPipeline - -from paddlenlp.transformers import ( - CLIPFeatureExtractor, - CLIPVisionConfig, - XLMRobertaTokenizer, -) -from ppdiffusers import AltDiffusionPipeline as PPDiffusersAltDiffusionPipeline -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - HeunDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - UNet2DConditionModel, -) -from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesConfig, - RobertaSeriesModelWithTransformation, -) -from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker - -paddle.set_device("cpu") - - -def convert_to_ppdiffusers(vae_or_unet, dtype="float32"): - need_transpose = [] - for k, v in vae_or_unet.named_modules(): - if isinstance(v, torch.nn.Linear): - need_transpose.append(k + ".weight") - new_vae_or_unet = OrderedDict() - for k, v in vae_or_unet.state_dict().items(): - if k not in need_transpose: - new_vae_or_unet[k] = v.cpu().numpy().astype(dtype) - else: - new_vae_or_unet[k] = v.t().cpu().numpy().astype(dtype) - return new_vae_or_unet - - -def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): - new_model_state = {} - transformers2ppnlp = { - ".encoder.": ".transformer.", - ".layer_norm": ".norm", - ".mlp.": ".", - ".fc1.": ".linear1.", - ".fc2.": ".linear2.", - ".final_layer_norm.": ".ln_final.", - ".embeddings.": ".", - ".position_embedding.": ".positional_embedding.", - ".patch_embedding.": ".conv1.", - "visual_projection.weight": "vision_projection", - "text_projection.weight": "text_projection", - ".pre_layrnorm.": ".ln_pre.", - ".post_layernorm.": ".ln_post.", - ".vision_model.": ".", - } - ignore_value = ["position_ids"] - donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] - - for name, value in clip.state_dict().items(): - # step1: ignore position_ids - if any(i in name for i in ignore_value): - continue - # step2: transpose nn.Linear weight - if value.ndim == 2 and not any(i in name for i in donot_transpose): - value = value.t() - # step3: hf_name -> ppnlp_name mapping - for hf_name, ppnlp_name in transformers2ppnlp.items(): - name = name.replace(hf_name, ppnlp_name) - # step4: 0d tensor -> 1d tensor - if name == "logit_scale": - value = value.reshape((1,)) - # step5: safety_checker need prefix "clip." - if "vision_model" in name: - name = "clip." 
+ name - new_model_state[name] = value.cpu().numpy().astype(dtype) - - if is_text_encoder: - new_config = { - "max_text_length": clip.config.max_position_embeddings, - "vocab_size": clip.config.vocab_size, - "text_embed_dim": clip.config.hidden_size, - "text_heads": clip.config.num_attention_heads, - "text_layers": clip.config.num_hidden_layers, - "text_hidden_act": clip.config.hidden_act, - "projection_dim": clip.config.projection_dim, - "initializer_range": clip.config.initializer_range, - "initializer_factor": clip.config.initializer_factor, - } - else: - new_config = { - "image_resolution": clip.config.vision_config.image_size, - "vision_layers": clip.config.vision_config.num_hidden_layers, - "vision_heads": clip.config.vision_config.num_attention_heads, - "vision_embed_dim": clip.config.vision_config.hidden_size, - "vision_patch_size": clip.config.vision_config.patch_size, - "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size, - "vision_hidden_act": clip.config.vision_config.hidden_act, - "projection_dim": clip.config.projection_dim, - } - return new_model_state, new_config - - -def convert_hf_xlm_roberta_to_ppnlp_xlm_roberta(xlm_roberta, dtype="float32"): - new_model_state = {} - mappings = [ - ["embeddings.word_embeddings.weight", "embeddings.word_embeddings.weight"], - ["embeddings.position_embeddings.weight", "embeddings.position_embeddings.weight"], - ["embeddings.token_type_embeddings.weight", "embeddings.token_type_embeddings.weight"], - ["embeddings.LayerNorm.weight", "embeddings.layer_norm.weight"], - ["embeddings.LayerNorm.bias", "embeddings.layer_norm.bias"], - ["pooler.dense.weight", "pooler.dense.weight", "transpose"], - ["pooler.dense.bias", "pooler.dense.bias"], - ["transformation.weight", "transformation.weight", "transpose"], - ["transformation.bias", "transformation.bias"], - ] - for layer_index in range(xlm_roberta.config.num_hidden_layers): - layer_mappings = [ - [ - f"encoder.layer.{layer_index}.attention.self.query.weight", - f"encoder.layers.{layer_index}.self_attn.q_proj.weight", - "transpose", - ], - [ - f"encoder.layer.{layer_index}.attention.self.query.bias", - f"encoder.layers.{layer_index}.self_attn.q_proj.bias", - ], - [ - f"encoder.layer.{layer_index}.attention.self.key.weight", - f"encoder.layers.{layer_index}.self_attn.k_proj.weight", - "transpose", - ], - [ - f"encoder.layer.{layer_index}.attention.self.key.bias", - f"encoder.layers.{layer_index}.self_attn.k_proj.bias", - ], - [ - f"encoder.layer.{layer_index}.attention.self.value.weight", - f"encoder.layers.{layer_index}.self_attn.v_proj.weight", - "transpose", - ], - [ - f"encoder.layer.{layer_index}.attention.self.value.bias", - f"encoder.layers.{layer_index}.self_attn.v_proj.bias", - ], - [ - f"encoder.layer.{layer_index}.attention.output.dense.weight", - f"encoder.layers.{layer_index}.self_attn.out_proj.weight", - "transpose", - ], - [ - f"encoder.layer.{layer_index}.attention.output.dense.bias", - f"encoder.layers.{layer_index}.self_attn.out_proj.bias", - ], - [ - f"encoder.layer.{layer_index}.attention.output.LayerNorm.weight", - f"encoder.layers.{layer_index}.norm1.weight", - ], - [ - f"encoder.layer.{layer_index}.attention.output.LayerNorm.bias", - f"encoder.layers.{layer_index}.norm1.bias", - ], - [ - f"encoder.layer.{layer_index}.intermediate.dense.weight", - f"encoder.layers.{layer_index}.linear1.weight", - "transpose", - ], - [f"encoder.layer.{layer_index}.intermediate.dense.bias", f"encoder.layers.{layer_index}.linear1.bias"], - [ - 
f"encoder.layer.{layer_index}.output.dense.weight", - f"encoder.layers.{layer_index}.linear2.weight", - "transpose", - ], - [f"encoder.layer.{layer_index}.output.dense.bias", f"encoder.layers.{layer_index}.linear2.bias"], - [f"encoder.layer.{layer_index}.output.LayerNorm.weight", f"encoder.layers.{layer_index}.norm2.weight"], - [f"encoder.layer.{layer_index}.output.LayerNorm.bias", f"encoder.layers.{layer_index}.norm2.bias"], - ] - mappings.extend(layer_mappings) - - state_dict = xlm_roberta.state_dict() - prefix = "roberta." - for data in mappings: - need_transpose = False - if len(data) == 3: - need_transpose = True - hf_name, pp_name = data[:2] - if "transformation." not in hf_name: - hf_name = prefix + hf_name - pp_name = prefix + pp_name - if need_transpose: - new_model_state[pp_name] = state_dict[hf_name].t().cpu().numpy().astype(dtype) - else: - new_model_state[pp_name] = state_dict[hf_name].cpu().numpy().astype(dtype) - - new_config = xlm_roberta.config.to_dict() - return new_model_state, new_config - - -def convert_diffusers_stable_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): - # 0. load diffusers pipe and convert to ppdiffusers weights format - diffusers_pipe = DiffusersAltDiffusionPipeline.from_pretrained(pretrained_model_name_or_path, use_auth_token=True) - vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) - unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) - text_encoder_state_dict, text_encoder_config = convert_hf_xlm_roberta_to_ppnlp_xlm_roberta( - diffusers_pipe.text_encoder - ) - safety_checker_state_dict, safety_checker_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.safety_checker, is_text_encoder=False - ) - - # 1. vae - pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) - - pp_vae.set_dict(vae_state_dict) - - # 2. unet - pp_unet = UNet2DConditionModel.from_config(diffusers_pipe.unet.config) - - pp_unet.set_dict(unet_state_dict) - - # 3. text_encoder - config = RobertaSeriesConfig(**text_encoder_config) - pp_text_encoder = RobertaSeriesModelWithTransformation(config) - pp_text_encoder.set_dict(text_encoder_state_dict) - - # 4. safety_checker - pp_safety_checker = StableDiffusionSafetyChecker(CLIPVisionConfig.from_dict(safety_checker_config)) - pp_safety_checker.set_dict(safety_checker_state_dict) - - # 5. 
scheduler - beta_start = diffusers_pipe.scheduler.beta_start - beta_end = diffusers_pipe.scheduler.beta_end - num_train_timesteps = diffusers_pipe.scheduler.num_train_timesteps - scheduler_type = diffusers_pipe.scheduler._class_name.lower() - - scheduler = DDIMScheduler( - beta_end=beta_end, - beta_schedule="scaled_linear", - beta_start=beta_start, - num_train_timesteps=num_train_timesteps, - steps_offset=1, - clip_sample=False, - set_alpha_to_one=False, - ) - # make sure scheduler works correctly with DDIM - scheduler.register_to_config(clip_sample=False) - - if scheduler_type == "pndm": - config = dict(scheduler.config) - config["skip_prk_steps"] = True - scheduler = PNDMScheduler.from_config(config) - elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "dpm": - scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) - elif scheduler_type == "ddim": - scheduler = scheduler - else: - raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - - with tempfile.TemporaryDirectory() as tmpdirname: - # 6. feature_extractor - # diffusers_pipe.feature_extractor.save_pretrained(tmpdirname) - pp_feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-v1-4/feature_extractor") - - # 7. tokenizer - diffusers_pipe.tokenizer.save_pretrained(tmpdirname) - pp_tokenizer = XLMRobertaTokenizer.from_pretrained(tmpdirname) - - # 8. create ppdiffusers pipe - paddle_pipe = PPDiffusersAltDiffusionPipeline( - vae=pp_vae, - text_encoder=pp_text_encoder, - tokenizer=pp_tokenizer, - unet=pp_unet, - safety_checker=pp_safety_checker, - feature_extractor=pp_feature_extractor, - scheduler=scheduler, - ) - # 9. save_pretrained - paddle_pipe.save_pretrained(output_path) - return paddle_pipe - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default="BAAI/AltDiffusion", - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--output_path", - type=str, - default="AltDiffusion-ppdiffusers", - help="The model output path.", - ) - args = parser.parse_args() - ppdiffusers_pipe = convert_diffusers_stable_diffusion_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path - ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_controlnet_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_controlnet_to_ppdiffusers.py deleted file mode 100644 index bd8d3e8bbb15..000000000000 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_controlnet_to_ppdiffusers.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -from collections import OrderedDict - -import paddle -import torch -from diffusers import ControlNetModel as DiffusersControlNetModel - -from ppdiffusers import ControlNetModel as PPDiffusersControlNetModel - -paddle.set_device("cpu") - - -def convert_to_ppdiffusers(controlnet, dtype="float32"): - need_transpose = [] - for k, v in controlnet.named_modules(): - if isinstance(v, torch.nn.Linear): - need_transpose.append(k + ".weight") - new_controlnet = OrderedDict() - for k, v in controlnet.state_dict().items(): - if k not in need_transpose: - new_controlnet[k] = v.cpu().numpy().astype(dtype) - else: - new_controlnet[k] = v.t().cpu().numpy().astype(dtype) - return new_controlnet - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default="lllyasviel/sd-controlnet-canny", - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--output_path", - type=str, - default="paddle_models/sd-controlnet-canny", - help="The output path.", - ) - args = parser.parse_args() - - th_controlnet = DiffusersControlNetModel.from_pretrained(args.pretrained_model_name_or_path) - controlnet_state_dict = convert_to_ppdiffusers(th_controlnet) - pp_controlnet = PPDiffusersControlNetModel.from_config(th_controlnet.config) - pp_controlnet.set_dict(controlnet_state_dict) - if not os.path.exists(args.output_path): - os.makedirs(args.output_path) - pp_controlnet.save_pretrained(args.output_path) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_latent_diffusion_model_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_latent_diffusion_model_to_ppdiffusers.py deleted file mode 100644 index 024705a35500..000000000000 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_latent_diffusion_model_to_ppdiffusers.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
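
Note on the convert_to_ppdiffusers helper repeated in the conversion scripts above: the transpose of every torch.nn.Linear weight comes from a layout difference between the two frameworks. torch.nn.Linear stores its weight as (out_features, in_features), while paddle.nn.Linear expects (in_features, out_features). The following is a minimal, self-contained sketch of that rule; the function name is hypothetical and the snippet only illustrates the transpose logic.

from collections import OrderedDict

import torch


def torch_state_dict_to_paddle(module: torch.nn.Module, dtype: str = "float32") -> OrderedDict:
    # torch.nn.Linear keeps weight as (out_features, in_features);
    # paddle.nn.Linear expects (in_features, out_features), so exactly
    # those weights are transposed before export to numpy.
    transpose_keys = {
        name + ".weight"
        for name, sub in module.named_modules()
        if isinstance(sub, torch.nn.Linear)
    }
    converted = OrderedDict()
    for key, tensor in module.state_dict().items():
        array = tensor.detach().cpu().numpy().astype(dtype)
        converted[key] = array.T if key in transpose_keys else array
    return converted
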
-import argparse -import tempfile -from collections import OrderedDict - -import paddle -import torch -from diffusers import LDMTextToImagePipeline as DiffusersLDMTextToImagePipeline - -from paddlenlp.transformers import BertTokenizer -from ppdiffusers import AutoencoderKL, DDIMScheduler, LDMBertModel -from ppdiffusers import LDMTextToImagePipeline as PPDiffusersLDMTextToImagePipeline -from ppdiffusers import LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel - -paddle.set_device("cpu") - - -def convert_to_ppdiffusers(vae_or_unet, dtype="float32"): - need_transpose = [] - for k, v in vae_or_unet.named_modules(): - if isinstance(v, torch.nn.Linear): - need_transpose.append(k + ".weight") - new_vae_or_unet = OrderedDict() - for k, v in vae_or_unet.state_dict().items(): - if k not in need_transpose: - new_vae_or_unet[k] = v.cpu().numpy().astype(dtype) - else: - new_vae_or_unet[k] = v.t().cpu().numpy().astype(dtype) - return new_vae_or_unet - - -def convert_hf_ldmbert_to_ppnlp_ldmbert(ldmbert, dtype="float32"): - transformers2ppnlp = { - "model.embed_tokens.weight": "embeddings.word_embeddings.weight", - "model.embed_positions.weight": "embeddings.position_embeddings.weight", - "model.layer_norm.": "final_layer_norm.", - "model.layers": "encoder.layers", - ".self_attn_layer_norm.": ".norm1.", - ".final_layer_norm.": ".norm2.", - ".fc1.": ".linear1.", - ".fc2.": ".linear2.", - } - ignore_value = ["to_logits"] - donot_transpose = ["embed_tokens", "embed_positions", "norm"] - new_model_state = OrderedDict() - for name, value in ldmbert.state_dict().items(): - # step1: ignore to_logits - if any(i in name for i in ignore_value): - continue - # step2: transpose nn.Linear weight - if value.ndim == 2 and not any(i in name for i in donot_transpose): - value = value.t() - # step3: hf_name -> ppnlp_name mapping - for hf_name, ppnlp_name in transformers2ppnlp.items(): - name = name.replace(hf_name, ppnlp_name) - new_model_state[name] = value.cpu().numpy().astype(dtype) - - new_config = { - "vocab_size": ldmbert.config.vocab_size, - "max_position_embeddings": ldmbert.config.max_position_embeddings, - "encoder_layers": ldmbert.config.encoder_layers, - "encoder_ffn_dim": ldmbert.config.encoder_ffn_dim, - "encoder_attention_heads": ldmbert.config.encoder_attention_heads, - "head_dim": ldmbert.config.head_dim, - "activation_function": ldmbert.config.activation_function, - "d_model": ldmbert.config.d_model, - "dropout": 0.0, # we do not use dropout in original ldmbert - "attention_dropout": ldmbert.config.attention_dropout, - "activation_dropout": ldmbert.config.activation_dropout, - "init_std": ldmbert.config.init_std, - "pad_token_id": ldmbert.config.pad_token_id, - } - return new_model_state, new_config - - -def convert_diffusers_stable_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): - # 0. load diffusers pipe and convert to ppdiffusers weights format - diffusers_pipe = DiffusersLDMTextToImagePipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True - ) - vqvae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vqvae) - unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) - bert_state_dict, bert_config = convert_hf_ldmbert_to_ppnlp_ldmbert(diffusers_pipe.bert) - - # 1. vqvae - pp_vqvae = AutoencoderKL.from_config(diffusers_pipe.vqvae.config) - pp_vqvae.set_dict(vqvae_state_dict) - - # 2. unet - pp_unet = UNet2DConditionModel.from_config(diffusers_pipe.unet.config) - - pp_unet.set_dict(unet_state_dict) - - # 3. 
bert - pp_bert = LDMBertModel(**bert_config) - pp_bert.set_dict(bert_state_dict) - - # 4. scheduler - beta_start = diffusers_pipe.scheduler.beta_start - beta_end = diffusers_pipe.scheduler.beta_end - scheduler_type = diffusers_pipe.scheduler._class_name.lower() - if "pndm" in scheduler_type: - pp_scheduler = PNDMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - # Make sure the scheduler compatible with DDIM - set_alpha_to_one=False, - steps_offset=1, - # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, - ) - elif "lms" in scheduler_type: - pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") - elif "ddim" in scheduler_type: - pp_scheduler = DDIMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - # Make sure the scheduler compatible with DDIM - clip_sample=False, - set_alpha_to_one=False, - steps_offset=1, - ) - else: - raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - - with tempfile.TemporaryDirectory() as tmpdirname: - # 5. tokenizer - diffusers_pipe.tokenizer.save_pretrained(tmpdirname) - pp_tokenizer = BertTokenizer.from_pretrained(tmpdirname, model_max_length=77) - - # 6. create ppdiffusers pipe - paddle_pipe = PPDiffusersLDMTextToImagePipeline( - vqvae=pp_vqvae, bert=pp_bert, tokenizer=pp_tokenizer, unet=pp_unet, scheduler=pp_scheduler - ) - - # 7. save_pretrained - paddle_pipe.save_pretrained(output_path) - return paddle_pipe - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default="CompVis/ldm-text2im-large-256", - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--output_path", - type=str, - default="ldm-text2im-large-256-ppdiffusers", - help="The model output path.", - ) - args = parser.parse_args() - ppdiffusers_pipe = convert_diffusers_stable_diffusion_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path - ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_paintbyexample_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_paintbyexample_to_ppdiffusers.py deleted file mode 100644 index ca11dcd91543..000000000000 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_paintbyexample_to_ppdiffusers.py +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
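
The convert_hf_ldmbert_to_ppnlp_ldmbert mapping above renames parameters by plain substring substitution over each Hugging Face key. A short worked example, assuming the usual HF LDMBert key layout (the concrete key below is illustrative only):

# subset of the mapping table used in the script above
transformers2ppnlp = {
    "model.embed_tokens.weight": "embeddings.word_embeddings.weight",
    "model.layers": "encoder.layers",
    ".self_attn_layer_norm.": ".norm1.",
    ".final_layer_norm.": ".norm2.",
    ".fc1.": ".linear1.",
    ".fc2.": ".linear2.",
}

name = "model.layers.0.self_attn_layer_norm.weight"
for hf_name, ppnlp_name in transformers2ppnlp.items():
    name = name.replace(hf_name, ppnlp_name)
print(name)  # -> encoder.layers.0.norm1.weight
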
-import argparse -import tempfile -from collections import OrderedDict - -import paddle -import torch -from diffusers import PaintByExamplePipeline as DiffusersPaintByExamplePipeline - -# CLIPImageProcessor need paddlenlp latest -from paddlenlp.transformers import CLIPImageProcessor, CLIPVisionConfig -from ppdiffusers import AutoencoderKL -from ppdiffusers import PaintByExamplePipeline as PPDiffusersPaintByExamplePipeline -from ppdiffusers import PNDMScheduler, UNet2DConditionModel -from ppdiffusers.pipelines.paint_by_example.image_encoder import ( - PaintByExampleImageEncoder, -) - -paddle.set_device("cpu") - - -def convert_to_ppdiffusers(vae_or_unet, dtype="float32", prefix=""): - need_transpose = [] - for k, v in vae_or_unet.named_modules(): - if isinstance(v, torch.nn.Linear): - need_transpose.append(k + ".weight") - new_vae_or_unet = OrderedDict() - for k, v in vae_or_unet.state_dict().items(): - if k not in need_transpose: - new_vae_or_unet[prefix + k] = v.cpu().numpy().astype(dtype) - else: - new_vae_or_unet[prefix + k] = v.t().cpu().numpy().astype(dtype) - return new_vae_or_unet - - -def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32"): - new_model_state = {} - transformers2ppnlp = { - ".encoder.": ".transformer.", - ".layer_norm": ".norm", - ".mlp.": ".", - ".fc1.": ".linear1.", - ".fc2.": ".linear2.", - ".final_layer_norm.": ".ln_final.", - ".embeddings.": ".", - ".position_embedding.": ".positional_embedding.", - ".patch_embedding.": ".conv1.", - "visual_projection.weight": "vision_projection", - "text_projection.weight": "text_projection", - ".pre_layrnorm.": ".ln_pre.", - ".post_layernorm.": ".ln_post.", - } - ignore_value = ["position_ids", "mapper"] - donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] - - for name, value in clip.state_dict().items(): - # step1: ignore position_ids and mapper - if any(i in name for i in ignore_value): - continue - # step2: transpose nn.Linear weight - if value.ndim == 2 and not any(i in name for i in donot_transpose): - value = value.t() - # step3: hf_name -> ppnlp_name mapping - for hf_name, ppnlp_name in transformers2ppnlp.items(): - name = name.replace(hf_name, ppnlp_name) - # step4: 0d tensor -> 1d tensor - if name == "logit_scale": - value = value.reshape((1,)) - - new_model_state[name] = value.cpu().numpy().astype(dtype) - - # convert mapper - mappersd = convert_to_ppdiffusers(clip.mapper, prefix="mapper.") - new_model_state.update(mappersd) - - new_config = { - "image_resolution": clip.config.image_size, - "vision_layers": clip.config.num_hidden_layers, - "vision_heads": clip.config.num_attention_heads, - "vision_embed_dim": clip.config.hidden_size, - "vision_patch_size": clip.config.patch_size, - "vision_mlp_ratio": clip.config.intermediate_size // clip.config.hidden_size, - "vision_hidden_act": clip.config.hidden_act, - "projection_dim": clip.config.projection_dim, - } - return new_model_state, new_config - - -def check_keys(model, state_dict): - cls_name = model.__class__.__name__ - missing_keys = [] - mismatched_keys = [] - for k, v in model.state_dict().items(): - if k not in state_dict.keys(): - missing_keys.append(k) - if list(v.shape) != list(state_dict[k].shape): - mismatched_keys.append(k) - if len(missing_keys): - missing_keys_str = ", ".join(missing_keys) - print(f"{cls_name} Found missing_keys {missing_keys_str}!") - if len(mismatched_keys): - mismatched_keys_str = ", ".join(mismatched_keys) - print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") - - -def 
convert_diffusers_paintbyexample_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): - # 0. load diffusers pipe and convert to ppdiffusers weights format - diffusers_pipe = DiffusersPaintByExamplePipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True - ) - vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) - unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) - image_encoder_state_dict, image_encoder_config = convert_hf_clip_to_ppnlp_clip(diffusers_pipe.image_encoder) - - # 1. vae - pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) - pp_vae.set_dict(vae_state_dict) - check_keys(pp_vae, vae_state_dict) - # 2. unet - pp_unet = UNet2DConditionModel.from_config(diffusers_pipe.unet.config) - pp_unet.set_dict(unet_state_dict) - check_keys(pp_unet, unet_state_dict) - - # 3. image_encoder - pp_image_encoder = PaintByExampleImageEncoder(CLIPVisionConfig.from_dict(image_encoder_config)) - pp_image_encoder.set_dict(image_encoder_state_dict) - check_keys(pp_image_encoder, image_encoder_state_dict) - # 4. scheduler - pp_scheduler = PNDMScheduler.from_config(diffusers_pipe.scheduler.config) - - with tempfile.TemporaryDirectory() as tmpdirname: - # 5. feature_extractor - diffusers_pipe.feature_extractor.save_pretrained(tmpdirname) - feature_extractor = CLIPImageProcessor.from_pretrained(tmpdirname) - - # 6. create ppdiffusers pipe - paddle_pipe = PPDiffusersPaintByExamplePipeline( - vae=pp_vae, - image_encoder=pp_image_encoder, - unet=pp_unet, - scheduler=pp_scheduler, - safety_checker=None, - feature_extractor=feature_extractor, - requires_safety_checker=False, - ) - - # 6. save_pretrained - paddle_pipe.save_pretrained(output_path) - return paddle_pipe - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default="Fantasy-Studio/Paint-by-Example", - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--output_path", - type=str, - default="./Paint-by-Example", - help="The model output path.", - ) - args = parser.parse_args() - ppdiffusers_pipe = convert_diffusers_paintbyexample_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path - ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion2.0_depth_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion2.0_depth_to_ppdiffusers.py deleted file mode 100644 index 35c69c0a6fd8..000000000000 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion2.0_depth_to_ppdiffusers.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
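
The convert_hf_clip_to_ppnlp_clip helpers above follow the same pattern for CLIP weights: 2-D weights not matched by the donot_transpose list are transposed, and the Hugging Face name is rewritten by substring substitution. A short worked example for one text-encoder key, assuming the standard HF CLIP layout (the concrete key is illustrative only):

# subset of the transformers2ppnlp table used in the scripts above
transformers2ppnlp = {
    ".encoder.": ".transformer.",
    ".mlp.": ".",
    ".fc1.": ".linear1.",
    ".fc2.": ".linear2.",
}

name = "text_model.encoder.layers.0.mlp.fc1.weight"
for hf_name, ppnlp_name in transformers2ppnlp.items():
    name = name.replace(hf_name, ppnlp_name)
print(name)  # -> text_model.transformer.layers.0.linear1.weight
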
-import argparse -import tempfile -from collections import OrderedDict - -import paddle -import torch -from diffusers import ( - StableDiffusionDepth2ImgPipeline as DiffusersStableDiffusionDepth2ImgPipeline, -) - -from paddlenlp.transformers import ( - CLIPTextConfig, - CLIPTextModel, - CLIPTokenizer, - DPTConfig, - DPTForDepthEstimation, - DPTImageProcessor, -) -from ppdiffusers import AutoencoderKL, PNDMScheduler -from ppdiffusers import ( - StableDiffusionDepth2ImgPipeline as PPDiffusersStableDiffusionDepth2ImgPipeline, -) -from ppdiffusers import UNet2DConditionModel - -paddle.set_device("cpu") - - -def convert_to_ppdiffusers(vae_or_unet, dtype="float32"): - need_transpose = [] - for k, v in vae_or_unet.named_modules(): - if isinstance(v, torch.nn.Linear): - need_transpose.append(k + ".weight") - new_vae_or_unet = OrderedDict() - for k, v in vae_or_unet.state_dict().items(): - if k not in need_transpose: - new_vae_or_unet[k] = v.cpu().numpy().astype(dtype) - else: - new_vae_or_unet[k] = v.t().cpu().numpy().astype(dtype) - return new_vae_or_unet - - -def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32"): - new_model_state = {} - transformers2ppnlp = { - ".encoder.": ".transformer.", - ".layer_norm": ".norm", - ".mlp.": ".", - ".fc1.": ".linear1.", - ".fc2.": ".linear2.", - ".final_layer_norm.": ".ln_final.", - ".embeddings.": ".", - ".position_embedding.": ".positional_embedding.", - ".patch_embedding.": ".conv1.", - "visual_projection.weight": "vision_projection", - "text_projection.weight": "text_projection", - ".pre_layrnorm.": ".ln_pre.", - ".post_layernorm.": ".ln_post.", - ".vision_model.": ".", - } - ignore_value = ["position_ids"] - donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] - - for name, value in clip.state_dict().items(): - # step1: ignore position_ids - if any(i in name for i in ignore_value): - continue - # step2: transpose nn.Linear weight - if value.ndim == 2 and not any(i in name for i in donot_transpose): - value = value.t() - # step3: hf_name -> ppnlp_name mapping - for hf_name, ppnlp_name in transformers2ppnlp.items(): - name = name.replace(hf_name, ppnlp_name) - # step4: 0d tensor -> 1d tensor - if name == "logit_scale": - value = value.reshape((1,)) - - new_model_state[name] = value.cpu().numpy().astype(dtype) - - new_config = { - "max_text_length": clip.config.max_position_embeddings, - "vocab_size": clip.config.vocab_size, - "text_embed_dim": clip.config.hidden_size, - "text_heads": clip.config.num_attention_heads, - "text_layers": clip.config.num_hidden_layers, - "text_hidden_act": clip.config.hidden_act, - "projection_dim": clip.config.projection_dim, - "initializer_range": clip.config.initializer_range, - "initializer_factor": clip.config.initializer_factor, - } - return new_model_state, new_config - - -def check_keys(model, state_dict): - cls_name = model.__class__.__name__ - missing_keys = [] - mismatched_keys = [] - for k, v in model.state_dict().items(): - if k not in state_dict.keys(): - missing_keys.append(k) - if list(v.shape) != list(state_dict[k].shape): - mismatched_keys.append(k) - if len(missing_keys): - missing_keys_str = ", ".join(missing_keys) - print(f"{cls_name} Found missing_keys {missing_keys_str}!") - if len(mismatched_keys): - mismatched_keys_str = ", ".join(mismatched_keys) - print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") - - -def convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): - # 0. 
load diffusers pipe and convert to ppdiffusers weights format - diffusers_pipe = DiffusersStableDiffusionDepth2ImgPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True - ) - vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) - unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) - depth_estimator_state_dict = convert_to_ppdiffusers(diffusers_pipe.depth_estimator) - text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip(diffusers_pipe.text_encoder) - - # 1. vae - pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) - pp_vae.set_dict(vae_state_dict) - check_keys(pp_vae, vae_state_dict) - # 2. unet - pp_unet = UNet2DConditionModel.from_config(diffusers_pipe.unet.config) - pp_unet.set_dict(unet_state_dict) - check_keys(pp_unet, unet_state_dict) - # 3. text_encoder - pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config)) - pp_text_encoder.set_dict(text_encoder_state_dict) - check_keys(pp_text_encoder, text_encoder_state_dict) - # 4. scheduler - pp_scheduler = PNDMScheduler.from_config(diffusers_pipe.scheduler.config) - - with tempfile.TemporaryDirectory() as tmpdirname: - # 5. depth_estimator - diffusers_pipe.depth_estimator.config.save_pretrained(tmpdirname) - config = DPTConfig.from_pretrained(tmpdirname, return_dict=True) - pp_depth_estimator = DPTForDepthEstimation(config) - pp_depth_estimator.set_dict(depth_estimator_state_dict) - check_keys(pp_depth_estimator, depth_estimator_state_dict) - # 6. tokenizer - diffusers_pipe.tokenizer.save_pretrained(tmpdirname) - pp_tokenizer = CLIPTokenizer.from_pretrained(tmpdirname) - - # 7. feature_extractor - diffusers_pipe.feature_extractor.save_pretrained(tmpdirname) - pp_feature_extractor = DPTImageProcessor.from_pretrained(tmpdirname) - - # 8. create ppdiffusers pipe - paddle_pipe = PPDiffusersStableDiffusionDepth2ImgPipeline( - vae=pp_vae, - text_encoder=pp_text_encoder, - tokenizer=pp_tokenizer, - unet=pp_unet, - feature_extractor=pp_feature_extractor, - depth_estimator=pp_depth_estimator, - scheduler=pp_scheduler, - ) - - # 9. save_pretrained - paddle_pipe.save_pretrained(output_path) - return paddle_pipe - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default="stabilityai/stable-diffusion-2-depth", - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--output_path", - type=str, - default="stable-diffusion-2-depth", - help="The model output path.", - ) - args = parser.parse_args() - ppdiffusers_pipe = convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path - ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers.py deleted file mode 100644 index e126049795e8..000000000000 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import tempfile -from collections import OrderedDict - -import paddle -import torch -from diffusers import ( - StableDiffusionControlNetPipeline as DiffusersStableDiffusionControlNetPipeline, -) - -from paddlenlp.transformers import ( - CLIPFeatureExtractor, - CLIPTextConfig, - CLIPTextModel, - CLIPTokenizer, - CLIPVisionConfig, -) -from ppdiffusers import ( - AutoencoderKL, - ControlNetModel, - DDIMScheduler, - LMSDiscreteScheduler, - PNDMScheduler, -) -from ppdiffusers import ( - StableDiffusionControlNetPipeline as PPDiffusersStableDiffusionControlNetPipeline, -) -from ppdiffusers import UNet2DConditionModel -from ppdiffusers.configuration_utils import FrozenDict -from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker - -paddle.set_device("cpu") - - -def convert_to_ppdiffusers(vae_or_unet, dtype="float32"): - need_transpose = [] - for k, v in vae_or_unet.named_modules(): - if isinstance(v, torch.nn.Linear): - need_transpose.append(k + ".weight") - new_vae_or_unet = OrderedDict() - for k, v in vae_or_unet.state_dict().items(): - if k not in need_transpose: - new_vae_or_unet[k] = v.cpu().numpy().astype(dtype) - else: - new_vae_or_unet[k] = v.t().cpu().numpy().astype(dtype) - return new_vae_or_unet - - -def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): - new_model_state = {} - transformers2ppnlp = { - ".encoder.": ".transformer.", - ".layer_norm": ".norm", - ".mlp.": ".", - ".fc1.": ".linear1.", - ".fc2.": ".linear2.", - ".final_layer_norm.": ".ln_final.", - ".embeddings.": ".", - ".position_embedding.": ".positional_embedding.", - ".patch_embedding.": ".conv1.", - "visual_projection.weight": "vision_projection", - "text_projection.weight": "text_projection", - ".pre_layrnorm.": ".ln_pre.", - ".post_layernorm.": ".ln_post.", - ".vision_model.": ".", - } - ignore_value = ["position_ids"] - donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] - - for name, value in clip.state_dict().items(): - # step1: ignore position_ids - if any(i in name for i in ignore_value): - continue - # step2: transpose nn.Linear weight - if value.ndim == 2 and not any(i in name for i in donot_transpose): - value = value.t() - # step3: hf_name -> ppnlp_name mapping - for hf_name, ppnlp_name in transformers2ppnlp.items(): - name = name.replace(hf_name, ppnlp_name) - # step4: 0d tensor -> 1d tensor - if name == "logit_scale": - value = value.reshape((1,)) - # step5: safety_checker need prefix "clip." - if "vision_model" in name: - name = "clip." 
+ name - new_model_state[name] = value.cpu().numpy().astype(dtype) - - if is_text_encoder: - new_config = { - "max_text_length": clip.config.max_position_embeddings, - "vocab_size": clip.config.vocab_size, - "text_embed_dim": clip.config.hidden_size, - "text_heads": clip.config.num_attention_heads, - "text_layers": clip.config.num_hidden_layers, - "text_hidden_act": clip.config.hidden_act, - "projection_dim": clip.config.projection_dim, - "initializer_range": clip.config.initializer_range, - "initializer_factor": clip.config.initializer_factor, - } - else: - new_config = { - "image_resolution": clip.config.vision_config.image_size, - "vision_layers": clip.config.vision_config.num_hidden_layers, - "vision_heads": clip.config.vision_config.num_attention_heads, - "vision_embed_dim": clip.config.vision_config.hidden_size, - "vision_patch_size": clip.config.vision_config.patch_size, - "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size, - "vision_hidden_act": clip.config.vision_config.hidden_act, - "projection_dim": clip.config.projection_dim, - } - return new_model_state, new_config - - -def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): - # 0. load diffusers pipe and convert to ppdiffusers weights format - diffusers_pipe = DiffusersStableDiffusionControlNetPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True - ) - requires_safety_checker = getattr(diffusers_pipe, "requires_safety_checker", False) - vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) - unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) - controlnet_state_dict = convert_to_ppdiffusers(diffusers_pipe.controlnet) - text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder, is_text_encoder=True - ) - - # 1. vae - pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) - - pp_vae.set_dict(vae_state_dict) - - # 2. unet - pp_unet = UNet2DConditionModel.from_config(diffusers_pipe.unet.config) - - pp_unet.set_dict(unet_state_dict) - - # 3. controlnet - pp_controlnet = ControlNetModel.from_config(diffusers_pipe.controlnet.config) - - pp_controlnet.set_dict(controlnet_state_dict) - - # 4. text_encoder - pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config)) - pp_text_encoder.set_dict(text_encoder_state_dict) - - # 5. scheduler - beta_start = diffusers_pipe.scheduler.beta_start - beta_end = diffusers_pipe.scheduler.beta_end - scheduler_type = diffusers_pipe.scheduler._class_name.lower() - if "pndm" in scheduler_type: - pp_scheduler = PNDMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - # Make sure the scheduler compatible with DDIM - set_alpha_to_one=False, - steps_offset=1, - # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, - ) - elif "lms" in scheduler_type: - pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") - elif "ddim" in scheduler_type: - pp_scheduler = DDIMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - # Make sure the scheduler compatible with DDIM - clip_sample=False, - set_alpha_to_one=False, - steps_offset=1, - ) - else: - raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - - with tempfile.TemporaryDirectory() as tmpdirname: - # 6. 
tokenizer - diffusers_pipe.tokenizer.save_pretrained(tmpdirname) - pp_tokenizer = CLIPTokenizer.from_pretrained(tmpdirname) - - if requires_safety_checker: - # 7. feature_extractor - # diffusers_pipe.feature_extractor.save_pretrained(tmpdirname) - pp_feature_extractor = CLIPFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-v1-4/feature_extractor" - ) - # 8. safety_checker - safety_checker_state_dict, safety_checker_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.safety_checker, is_text_encoder=False - ) - pp_safety_checker = StableDiffusionSafetyChecker(CLIPVisionConfig.from_dict(safety_checker_config)) - pp_safety_checker.set_dict(safety_checker_state_dict) - # 9. create ppdiffusers pipe - paddle_pipe = PPDiffusersStableDiffusionControlNetPipeline( - vae=pp_vae, - text_encoder=pp_text_encoder, - tokenizer=pp_tokenizer, - unet=pp_unet, - controlnet=pp_controlnet, - safety_checker=pp_safety_checker, - feature_extractor=pp_feature_extractor, - scheduler=pp_scheduler, - ) - else: - # 9. create ppdiffusers pipe - paddle_pipe = PPDiffusersStableDiffusionControlNetPipeline( - vae=pp_vae, - text_encoder=pp_text_encoder, - tokenizer=pp_tokenizer, - unet=pp_unet, - controlnet=pp_controlnet, - safety_checker=None, - feature_extractor=None, - scheduler=pp_scheduler, - requires_safety_checker=False, - ) - if "runwayml/stable-diffusion-inpainting" in pretrained_model_name_or_path: - _internal_dict = dict(paddle_pipe._internal_dict) - if _internal_dict["_ppdiffusers_version"] == "0.0.0": - _internal_dict.update({"_ppdiffusers_version": "0.6.0"}) - paddle_pipe._internal_dict = FrozenDict(_internal_dict) - # 10. save_pretrained - paddle_pipe.save_pretrained(output_path) - return paddle_pipe - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default="takuma104/control_sd15_canny", - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--output_path", - type=str, - default="control_sd15_canny-ppdiffusers", - help="The model output path.", - ) - args = parser.parse_args() - ppdiffusers_pipe = convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path - ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_to_ppdiffusers.py deleted file mode 100644 index 6f77cbe89ce5..000000000000 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_to_ppdiffusers.py +++ /dev/null @@ -1,244 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
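
For reference, the conversion entry point above can also be called directly from Python instead of through argparse. A minimal usage sketch that reuses the script's own default model id and output directory:

pipe = convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers(
    "takuma104/control_sd15_canny",
    output_path="control_sd15_canny-ppdiffusers",
)
# pipe is the assembled ppdiffusers StableDiffusionControlNetPipeline,
# already written to the output directory by save_pretrained above.
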
-import argparse -import tempfile -from collections import OrderedDict - -import paddle -import torch -from diffusers import StableDiffusionPipeline as DiffusersStableDiffusionPipeline - -from paddlenlp.transformers import ( - CLIPFeatureExtractor, - CLIPTextConfig, - CLIPTextModel, - CLIPTokenizer, - CLIPVisionConfig, -) -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - LMSDiscreteScheduler, - PNDMScheduler, -) -from ppdiffusers import StableDiffusionPipeline as PPDiffusersStableDiffusionPipeline -from ppdiffusers import UNet2DConditionModel -from ppdiffusers.configuration_utils import FrozenDict -from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker - -paddle.set_device("cpu") - - -def convert_to_ppdiffusers(vae_or_unet, dtype="float32"): - need_transpose = [] - for k, v in vae_or_unet.named_modules(): - if isinstance(v, torch.nn.Linear): - need_transpose.append(k + ".weight") - new_vae_or_unet = OrderedDict() - for k, v in vae_or_unet.state_dict().items(): - if k not in need_transpose: - new_vae_or_unet[k] = v.cpu().numpy().astype(dtype) - else: - new_vae_or_unet[k] = v.t().cpu().numpy().astype(dtype) - return new_vae_or_unet - - -def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): - new_model_state = {} - transformers2ppnlp = { - ".encoder.": ".transformer.", - ".layer_norm": ".norm", - ".mlp.": ".", - ".fc1.": ".linear1.", - ".fc2.": ".linear2.", - ".final_layer_norm.": ".ln_final.", - ".embeddings.": ".", - ".position_embedding.": ".positional_embedding.", - ".patch_embedding.": ".conv1.", - "visual_projection.weight": "vision_projection", - "text_projection.weight": "text_projection", - ".pre_layrnorm.": ".ln_pre.", - ".post_layernorm.": ".ln_post.", - ".vision_model.": ".", - } - ignore_value = ["position_ids"] - donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] - - for name, value in clip.state_dict().items(): - # step1: ignore position_ids - if any(i in name for i in ignore_value): - continue - # step2: transpose nn.Linear weight - if value.ndim == 2 and not any(i in name for i in donot_transpose): - value = value.t() - # step3: hf_name -> ppnlp_name mapping - for hf_name, ppnlp_name in transformers2ppnlp.items(): - name = name.replace(hf_name, ppnlp_name) - # step4: 0d tensor -> 1d tensor - if name == "logit_scale": - value = value.reshape((1,)) - # step5: safety_checker need prefix "clip." - if "vision_model" in name: - name = "clip." 
+ name - new_model_state[name] = value.cpu().numpy().astype(dtype) - - if is_text_encoder: - new_config = { - "max_text_length": clip.config.max_position_embeddings, - "vocab_size": clip.config.vocab_size, - "text_embed_dim": clip.config.hidden_size, - "text_heads": clip.config.num_attention_heads, - "text_layers": clip.config.num_hidden_layers, - "text_hidden_act": clip.config.hidden_act, - "projection_dim": clip.config.projection_dim, - "initializer_range": clip.config.initializer_range, - "initializer_factor": clip.config.initializer_factor, - } - else: - new_config = { - "image_resolution": clip.config.vision_config.image_size, - "vision_layers": clip.config.vision_config.num_hidden_layers, - "vision_heads": clip.config.vision_config.num_attention_heads, - "vision_embed_dim": clip.config.vision_config.hidden_size, - "vision_patch_size": clip.config.vision_config.patch_size, - "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size, - "vision_hidden_act": clip.config.vision_config.hidden_act, - "projection_dim": clip.config.projection_dim, - } - return new_model_state, new_config - - -def convert_diffusers_stable_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): - # 0. load diffusers pipe and convert to ppdiffusers weights format - diffusers_pipe = DiffusersStableDiffusionPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True - ) - requires_safety_checker = getattr(diffusers_pipe, "requires_safety_checker", False) - vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) - unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) - text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder, is_text_encoder=True - ) - - # 1. vae - pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) - - pp_vae.set_dict(vae_state_dict) - - # 2. unet - pp_unet = UNet2DConditionModel.from_config(diffusers_pipe.unet.config) - - pp_unet.set_dict(unet_state_dict) - - # 3. text_encoder - pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config)) - pp_text_encoder.set_dict(text_encoder_state_dict) - - # 4. scheduler - beta_start = diffusers_pipe.scheduler.beta_start - beta_end = diffusers_pipe.scheduler.beta_end - scheduler_type = diffusers_pipe.scheduler._class_name.lower() - if "pndm" in scheduler_type: - pp_scheduler = PNDMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - # Make sure the scheduler compatible with DDIM - set_alpha_to_one=False, - steps_offset=1, - # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, - ) - elif "lms" in scheduler_type: - pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") - elif "ddim" in scheduler_type: - pp_scheduler = DDIMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - # Make sure the scheduler compatible with DDIM - clip_sample=False, - set_alpha_to_one=False, - steps_offset=1, - ) - else: - raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - - with tempfile.TemporaryDirectory() as tmpdirname: - # 5. tokenizer - diffusers_pipe.tokenizer.save_pretrained(tmpdirname) - pp_tokenizer = CLIPTokenizer.from_pretrained(tmpdirname) - - if requires_safety_checker: - # 6. 
feature_extractor - # diffusers_pipe.feature_extractor.save_pretrained(tmpdirname) - pp_feature_extractor = CLIPFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-v1-4/feature_extractor" - ) - # 7. safety_checker - safety_checker_state_dict, safety_checker_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.safety_checker, is_text_encoder=False - ) - pp_safety_checker = StableDiffusionSafetyChecker(CLIPVisionConfig.from_dict(safety_checker_config)) - pp_safety_checker.set_dict(safety_checker_state_dict) - # 8. create ppdiffusers pipe - paddle_pipe = PPDiffusersStableDiffusionPipeline( - vae=pp_vae, - text_encoder=pp_text_encoder, - tokenizer=pp_tokenizer, - unet=pp_unet, - safety_checker=pp_safety_checker, - feature_extractor=pp_feature_extractor, - scheduler=pp_scheduler, - ) - else: - # 8. create ppdiffusers pipe - paddle_pipe = PPDiffusersStableDiffusionPipeline( - vae=pp_vae, - text_encoder=pp_text_encoder, - tokenizer=pp_tokenizer, - unet=pp_unet, - safety_checker=None, - feature_extractor=None, - scheduler=pp_scheduler, - requires_safety_checker=False, - ) - if "runwayml/stable-diffusion-inpainting" in pretrained_model_name_or_path: - _internal_dict = dict(paddle_pipe._internal_dict) - if _internal_dict["_ppdiffusers_version"] == "0.0.0": - _internal_dict.update({"_ppdiffusers_version": "0.6.0"}) - paddle_pipe._internal_dict = FrozenDict(_internal_dict) - # 9. save_pretrained - paddle_pipe.save_pretrained(output_path) - return paddle_pipe - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default="runwayml/stable-diffusion-v1-5", - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--output_path", - type=str, - default="stable-diffusion-v1-5-ppdiffusers", - help="The model output path.", - ) - args = parser.parse_args() - ppdiffusers_pipe = convert_diffusers_stable_diffusion_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path - ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_unclip_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_unclip_to_ppdiffusers.py deleted file mode 100644 index 1bb1839c57c3..000000000000 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_unclip_to_ppdiffusers.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
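Note: every converter in this directory exposes the same two-argument CLI (--pretrained_model_name_or_path and --output_path), and the saved directory is meant to be loaded back with the matching ppdiffusers pipeline. A usage sketch for the Stable Diffusion converter above, assuming ppdiffusers mirrors the diffusers loading and inference API; the prompt and output filename are illustrative.

# Convert once, offline, using the script's own defaults:
#   python convert_diffusers_stable_diffusion_to_ppdiffusers.py \
#       --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \
#       --output_path stable-diffusion-v1-5-ppdiffusers
from ppdiffusers import StableDiffusionPipeline

# Load the converted Paddle weights and run a quick smoke test.
pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5-ppdiffusers")
image = pipe("a photo of an astronaut riding a horse").images[0]
image.save("astronaut.png")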
-import argparse -import tempfile -from collections import OrderedDict - -import paddle -import torch -from diffusers import UnCLIPPipeline as DiffusersUnCLIPPipeline - -from paddlenlp.transformers import ( - CLIPTextConfig, - CLIPTextModelWithProjection, - CLIPTokenizer, -) -from ppdiffusers import PriorTransformer -from ppdiffusers import UnCLIPPipeline as PPDiffusersUnCLIPPipeline -from ppdiffusers import UnCLIPScheduler, UNet2DConditionModel, UNet2DModel -from ppdiffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel - -paddle.set_device("cpu") - - -def convert_to_ppdiffusers(vae_or_unet, dtype="float32", prefix=""): - need_transpose = [] - for k, v in vae_or_unet.named_modules(): - if isinstance(v, torch.nn.Linear): - need_transpose.append(k + ".weight") - new_vae_or_unet = OrderedDict() - for k, v in vae_or_unet.state_dict().items(): - if k not in need_transpose: - new_vae_or_unet[prefix + k] = v.cpu().numpy().astype(dtype) - else: - new_vae_or_unet[prefix + k] = v.t().cpu().numpy().astype(dtype) - return new_vae_or_unet - - -def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True, need_prefix=False): - new_model_state = {} - transformers2ppnlp = { - ".encoder.": ".transformer.", - ".layer_norm": ".norm", - ".mlp.": ".", - ".fc1.": ".linear1.", - ".fc2.": ".linear2.", - ".final_layer_norm.": ".ln_final.", - ".embeddings.": ".", - ".position_embedding.": ".positional_embedding.", - ".patch_embedding.": ".conv1.", - "visual_projection.weight": "vision_projection", - "text_projection.weight": "text_projection", - ".pre_layrnorm.": ".ln_pre.", - ".post_layernorm.": ".ln_post.", - ".vision_model.": ".", - } - ignore_value = ["position_ids"] - donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] - - for name, value in clip.state_dict().items(): - # step1: ignore position_ids - if any(i in name for i in ignore_value): - continue - # step2: transpose nn.Linear weight - if value.ndim == 2 and not any(i in name for i in donot_transpose): - value = value.t() - # step3: hf_name -> ppnlp_name mapping - for hf_name, ppnlp_name in transformers2ppnlp.items(): - name = name.replace(hf_name, ppnlp_name) - # step4: 0d tensor -> 1d tensor - if name == "logit_scale": - value = value.reshape((1,)) - # step5: safety_checker need prefix "clip." - if "vision_model" in name and need_prefix: - name = "clip." 
+ name - new_model_state[name] = value.cpu().numpy().astype(dtype) - - if is_text_encoder: - new_config = { - "max_text_length": clip.config.max_position_embeddings, - "vocab_size": clip.config.vocab_size, - "text_embed_dim": clip.config.hidden_size, - "text_heads": clip.config.num_attention_heads, - "text_layers": clip.config.num_hidden_layers, - "text_hidden_act": clip.config.hidden_act, - "projection_dim": clip.config.projection_dim, - "initializer_range": clip.config.initializer_range, - "initializer_factor": clip.config.initializer_factor, - } - else: - if need_prefix: - new_config = { - "image_resolution": clip.config.vision_config.image_size, - "vision_layers": clip.config.vision_config.num_hidden_layers, - "vision_heads": clip.config.vision_config.num_attention_heads, - "vision_embed_dim": clip.config.vision_config.hidden_size, - "vision_patch_size": clip.config.vision_config.patch_size, - "vision_mlp_ratio": clip.config.vision_config.intermediate_size - // clip.config.vision_config.hidden_size, - "vision_hidden_act": clip.config.vision_config.hidden_act, - "projection_dim": clip.config.projection_dim, - } - else: - new_config = { - "image_resolution": clip.config.image_size, - "vision_layers": clip.config.num_hidden_layers, - "vision_heads": clip.config.num_attention_heads, - "vision_embed_dim": clip.config.hidden_size, - "vision_patch_size": clip.config.patch_size, - "vision_mlp_ratio": clip.config.intermediate_size // clip.config.hidden_size, - "vision_hidden_act": clip.config.hidden_act, - "projection_dim": clip.config.projection_dim, - } - return new_model_state, new_config - - -def check_keys(model, state_dict): - cls_name = model.__class__.__name__ - missing_keys = [] - mismatched_keys = [] - for k, v in model.state_dict().items(): - if k not in state_dict.keys(): - missing_keys.append(k) - if list(v.shape) != list(state_dict[k].shape): - mismatched_keys.append(k) - if len(missing_keys): - missing_keys_str = ", ".join(missing_keys) - print(f"{cls_name} Found missing_keys {missing_keys_str}!") - if len(mismatched_keys): - mismatched_keys_str = ", ".join(mismatched_keys) - print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") - - -def convert_diffusers_unclip_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): - # 0. 
load diffusers pipe and convert to ppdiffusers weights format - diffusers_pipe = DiffusersUnCLIPPipeline.from_pretrained(pretrained_model_name_or_path, use_auth_token=True) - prior_state_dict = convert_to_ppdiffusers(diffusers_pipe.prior) - decoder_state_dict = convert_to_ppdiffusers(diffusers_pipe.decoder) - text_proj_state_dict = convert_to_ppdiffusers(diffusers_pipe.text_proj) - super_res_first_state_dict = convert_to_ppdiffusers(diffusers_pipe.super_res_first) - super_res_last_state_dict = convert_to_ppdiffusers(diffusers_pipe.super_res_last) - text_encoder_state_dict, text_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder, is_text_encoder=True, need_prefix=False - ) - - pp_prior = PriorTransformer.from_config(diffusers_pipe.prior.config) - pp_prior.set_dict(prior_state_dict) - check_keys(pp_prior, prior_state_dict) - - pp_decoder = UNet2DConditionModel.from_config(diffusers_pipe.decoder.config) - pp_decoder.set_dict(decoder_state_dict) - check_keys(pp_decoder, decoder_state_dict) - - pp_text_proj = UnCLIPTextProjModel.from_config(diffusers_pipe.text_proj.config) - pp_text_proj.set_dict(text_proj_state_dict) - check_keys(pp_text_proj, text_proj_state_dict) - - pp_super_res_first = UNet2DModel.from_config(diffusers_pipe.super_res_first.config) - pp_super_res_first.set_dict(super_res_first_state_dict) - check_keys(pp_super_res_first, super_res_first_state_dict) - - pp_super_res_last = UNet2DModel.from_config(diffusers_pipe.super_res_last.config) - pp_super_res_last.set_dict(super_res_last_state_dict) - check_keys(pp_super_res_last, super_res_last_state_dict) - - pp_text_encoder = CLIPTextModelWithProjection(CLIPTextConfig.from_dict(text_config)) - pp_text_encoder.set_dict(text_encoder_state_dict) - check_keys(pp_text_encoder, text_encoder_state_dict) - - pp_prior_scheduler = UnCLIPScheduler.from_config(diffusers_pipe.prior_scheduler.config) - pp_decoder_scheduler = UnCLIPScheduler.from_config(diffusers_pipe.decoder_scheduler.config) - pp_super_res_scheduler = UnCLIPScheduler.from_config(diffusers_pipe.super_res_scheduler.config) - - with tempfile.TemporaryDirectory() as tmpdirname: - # 5. feature_extractor - diffusers_pipe.tokenizer.save_pretrained(tmpdirname) - pp_tokenizer = CLIPTokenizer.from_pretrained(tmpdirname) - # 6. create ppdiffusers pipe - paddle_pipe = PPDiffusersUnCLIPPipeline( - prior=pp_prior, - decoder=pp_decoder, - text_encoder=pp_text_encoder, - tokenizer=pp_tokenizer, - text_proj=pp_text_proj, - super_res_first=pp_super_res_first, - super_res_last=pp_super_res_last, - prior_scheduler=pp_prior_scheduler, - decoder_scheduler=pp_decoder_scheduler, - super_res_scheduler=pp_super_res_scheduler, - ) - # 6. 
save_pretrained - paddle_pipe.save_pretrained(output_path) - return paddle_pipe - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default="kakaobrain/karlo-v1-alpha", - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--output_path", - type=str, - default="./karlo-v1-alpha", - help="The model output path.", - ) - args = parser.parse_args() - ppdiffusers_pipe = convert_diffusers_unclip_to_ppdiffusers(args.pretrained_model_name_or_path, args.output_path) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_vq_diffusion_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_vq_diffusion_to_ppdiffusers.py deleted file mode 100644 index c0718a9f0001..000000000000 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_vq_diffusion_to_ppdiffusers.py +++ /dev/null @@ -1,186 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import tempfile -from collections import OrderedDict - -import paddle -import torch -from diffusers import VQDiffusionPipeline as DiffusersVQDiffusionPipeline - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import Transformer2DModel -from ppdiffusers import VQDiffusionPipeline as PPDiffusersVQDiffusionPipeline -from ppdiffusers import VQDiffusionScheduler, VQModel -from ppdiffusers.pipelines.vq_diffusion import LearnedClassifierFreeSamplingEmbeddings - -paddle.set_device("cpu") - - -def convert_to_ppdiffusers(vae_or_unet, dtype="float32"): - need_transpose = [] - for k, v in vae_or_unet.named_modules(): - if isinstance(v, torch.nn.Linear): - need_transpose.append(k + ".weight") - new_vae_or_unet = OrderedDict() - for k, v in vae_or_unet.state_dict().items(): - if k not in need_transpose: - new_vae_or_unet[k] = v.cpu().numpy().astype(dtype) - else: - new_vae_or_unet[k] = v.t().cpu().numpy().astype(dtype) - return new_vae_or_unet - - -def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): - new_model_state = {} - transformers2ppnlp = { - ".encoder.": ".transformer.", - ".layer_norm": ".norm", - ".mlp.": ".", - ".fc1.": ".linear1.", - ".fc2.": ".linear2.", - ".final_layer_norm.": ".ln_final.", - ".embeddings.": ".", - ".position_embedding.": ".positional_embedding.", - ".patch_embedding.": ".conv1.", - "visual_projection.weight": "vision_projection", - "text_projection.weight": "text_projection", - ".pre_layrnorm.": ".ln_pre.", - ".post_layernorm.": ".ln_post.", - ".vision_model.": ".", - } - ignore_value = ["position_ids"] - donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] - - for name, value in clip.state_dict().items(): - # step1: ignore position_ids - if any(i in name for i in ignore_value): - continue - # step2: 
transpose nn.Linear weight - if value.ndim == 2 and not any(i in name for i in donot_transpose): - value = value.t() - # step3: hf_name -> ppnlp_name mapping - for hf_name, ppnlp_name in transformers2ppnlp.items(): - name = name.replace(hf_name, ppnlp_name) - # step4: 0d tensor -> 1d tensor - if name == "logit_scale": - value = value.reshape((1,)) - # step5: safety_checker need prefix "clip." - if "vision_model" in name: - name = "clip." + name - new_model_state[name] = value.cpu().numpy().astype(dtype) - - if is_text_encoder: - new_config = { - "max_text_length": clip.config.max_position_embeddings, - "vocab_size": clip.config.vocab_size, - "text_embed_dim": clip.config.hidden_size, - "text_heads": clip.config.num_attention_heads, - "text_layers": clip.config.num_hidden_layers, - "text_hidden_act": clip.config.hidden_act, - "projection_dim": clip.config.projection_dim, - "initializer_range": clip.config.initializer_range, - "initializer_factor": clip.config.initializer_factor, - } - else: - new_config = { - "image_resolution": clip.config.vision_config.image_size, - "vision_layers": clip.config.vision_config.num_hidden_layers, - "vision_heads": clip.config.vision_config.num_attention_heads, - "vision_embed_dim": clip.config.vision_config.hidden_size, - "vision_patch_size": clip.config.vision_config.patch_size, - "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size, - "vision_hidden_act": clip.config.vision_config.hidden_act, - "projection_dim": clip.config.projection_dim, - } - return new_model_state, new_config - - -def convert_diffusers_vq_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): - - # 0. load diffusers pipe and convert to ppdiffusers weights format - diffusers_pipe = DiffusersVQDiffusionPipeline.from_pretrained(pretrained_model_name_or_path, use_auth_token=True) - - # 1. vqvae - vqvae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vqvae) - # 2. transformer - transformer_state_dict = convert_to_ppdiffusers(diffusers_pipe.transformer) - # 3. learned_classifier_free_sampling_embeddings - learned_classifier_free_sampling_embeddings_state_dict = convert_to_ppdiffusers( - diffusers_pipe.learned_classifier_free_sampling_embeddings - ) - # 4.text_encoder - text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder, is_text_encoder=True - ) - - # 1. vqvae - pp_vqvae = VQModel.from_config(diffusers_pipe.vqvae.config) - pp_vqvae.set_dict(vqvae_state_dict) - - # 2. transformer - pp_transformer = Transformer2DModel.from_config(diffusers_pipe.transformer.config) - pp_transformer.set_dict(transformer_state_dict) - - # 3. pp_learned_classifier_free_sampling_embeddings - pp_learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings.from_config( - diffusers_pipe.learned_classifier_free_sampling_embeddings.config - ) - pp_learned_classifier_free_sampling_embeddings.set_dict(learned_classifier_free_sampling_embeddings_state_dict) - - # 4. text_encoder - pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config)) - pp_text_encoder.set_dict(text_encoder_state_dict) - - # 5. scheduler - pp_scheduler = VQDiffusionScheduler.from_config(diffusers_pipe.scheduler.config) - - with tempfile.TemporaryDirectory() as tmpdirname: - # 6. tokenizer - diffusers_pipe.tokenizer.save_pretrained(tmpdirname) - pp_tokenizer = CLIPTokenizer.from_pretrained(tmpdirname) - - # 7. 
create ppdiffusers pipe - paddle_pipe = PPDiffusersVQDiffusionPipeline( - vqvae=pp_vqvae, - text_encoder=pp_text_encoder, - tokenizer=pp_tokenizer, - transformer=pp_transformer, - learned_classifier_free_sampling_embeddings=pp_learned_classifier_free_sampling_embeddings, - scheduler=pp_scheduler, - ) - - # 8. save_pretrained - paddle_pipe.save_pretrained(output_path) - return paddle_pipe - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default="microsoft/vq-diffusion-ithq", - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--output_path", - type=str, - default="microsoft/vq-diffusion-ithq-ppdiffusers", - help="The model output path.", - ) - args = parser.parse_args() - ppdiffusers_pipe = convert_diffusers_vq_diffusion_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path - ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_orig_sd_ckpt_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_orig_sd_ckpt_to_ppdiffusers.py deleted file mode 100644 index 18c5a1004833..000000000000 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_orig_sd_ckpt_to_ppdiffusers.py +++ /dev/null @@ -1,974 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import io -import pickle -from functools import lru_cache - -import numpy as np -import paddle - -from paddlenlp.utils.downloader import get_path_from_url - -try: - from omegaconf import OmegaConf -except ImportError: - raise ImportError( - "OmegaConf is required to convert the SD checkpoints. Please install it with `pip install OmegaConf`." 
- ) -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - HeunDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionPipeline, - UNet2DConditionModel, -) - -paddle.set_device("cpu") -MZ_ZIP_LOCAL_DIR_HEADER_SIZE = 30 - - -class TensorMeta: - """ - metadata of tensor - """ - - def __init__(self, key: str, n_bytes: int, dtype: str): - self.key = key - self.nbytes = n_bytes - self.dtype = dtype - self.size = None - - def __repr__(self): - return f"size: {self.size} key: {self.key}, nbytes: {self.nbytes}, dtype: {self.dtype}" - - -@lru_cache(maxsize=None) -def _storage_type_to_dtype_to_map(): - """convert storage type to numpy dtype""" - return { - "DoubleStorage": np.double, - "FloatStorage": np.float32, - "HalfStorage": np.half, - "LongStorage": np.int64, - "IntStorage": np.int32, - "ShortStorage": np.int16, - "CharStorage": np.int8, - "ByteStorage": np.uint8, - "BoolStorage": np.bool8, - "ComplexDoubleStorage": np.cdouble, - "ComplexFloatStorage": np.cfloat, - } - - -class StorageType: - """Temp Class for Storage Type""" - - def __init__(self, name): - self.dtype = _storage_type_to_dtype_to_map()[name] - - def __str__(self): - return f"StorageType(dtype={self.dtype})" - - -def _element_size(dtype: str) -> int: - """ - Returns the element size for a dtype, in bytes - """ - if dtype in [np.float16, np.float32, np.float64]: - return np.finfo(dtype).bits >> 3 - elif dtype == np.bool8: - return 1 - else: - return np.iinfo(dtype).bits >> 3 - - -class UnpicklerWrapperStage(pickle.Unpickler): - def find_class(self, mod_name, name): - if type(name) is str and "Storage" in name: - try: - return StorageType(name) - except KeyError: - pass - - # pure torch tensor builder - if mod_name == "torch._utils": - return _rebuild_tensor_stage - - # pytorch_lightning tensor builder - if mod_name == "pytorch_lightning": - return dumpy - return super().find_class(mod_name, name) - - -def get_data_iostream(file: str, file_name="data.pkl"): - FILENAME = f"archive/{file_name}".encode("latin") - padding_size_plus_fbxx = 4 + 14 - data_iostream = [] - offset = MZ_ZIP_LOCAL_DIR_HEADER_SIZE + len(FILENAME) + padding_size_plus_fbxx - with open(file, "rb") as r: - r.seek(offset) - for bytes_data in io.BytesIO(r.read()): - if b".PK" in bytes_data: - data_iostream.append(bytes_data.split(b".PK")[0]) - data_iostream.append(b".") - break - data_iostream.append(bytes_data) - out = b"".join(data_iostream) - return out, offset + len(out) - - -def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, backward_hooks): - if isinstance(storage, TensorMeta): - storage.size = size - return storage - - -def dumpy(*args, **kwarsg): - return None - - -def load_torch(path: str, **pickle_load_args): - """ - load torch weight file with the following steps: - - 1. load the structure of pytorch weight file - 2. read the tensor data and re-construct the state-dict - - Args: - path: the path of pytorch weight file - **pickle_load_args: args of pickle module - - Returns: - - """ - pickle_load_args.update({"encoding": "utf-8"}) - - # 1. 
load the structure of pytorch weight file - def persistent_load_stage1(saved_id): - assert isinstance(saved_id, tuple) - - data = saved_id[1:] - storage_type, key, _, numel = data - dtype = storage_type.dtype - n_bytes = numel * _element_size(dtype) - return TensorMeta(key, n_bytes, dtype) - - data_iostream, pre_offset = get_data_iostream(path, file_name="data.pkl") - # 1. read the structure of storage - unpickler_stage1 = UnpicklerWrapperStage(io.BytesIO(data_iostream), **pickle_load_args) - unpickler_stage1.persistent_load = persistent_load_stage1 - result_stage1 = unpickler_stage1.load() - - # 2. get the metadata of weight file - metadata = [] - - def extract_maybe_dict(result): - if isinstance(result, dict): - for k, v in result.items(): - extract_maybe_dict(v) - elif isinstance(result, (list, tuple)): - for res in result: - extract_maybe_dict(res) - elif isinstance(result, TensorMeta): - metadata.append(result) - - extract_maybe_dict(result_stage1) - metadata = sorted(metadata, key=lambda x: x.key) - # 3. parse the tensor of pytorch weight file - stage1_key_to_tensor = {} - with open(path, "rb") as file_handler: - file_handler.seek(pre_offset) - for tensor_meta in metadata: - key = tensor_meta.key - # eg: archive/data/1FB - filename_with_fb = len(f"archive/data/{key}") + 2 - - # skip the fix position to read tensor data - # `MZ_ZIP_LOCAL_DIR_HEADER_SIZE` is from: https://github.com/pytorch/pytorch/blob/master/caffe2/serialize/inline_container.cc#L186 - # `16` is the fixed characters size from binary file. - # `filename_with_fb` is the length of dynamic data key name - file_handler.seek(MZ_ZIP_LOCAL_DIR_HEADER_SIZE + 16 + filename_with_fb, 1) - - padding_offset = np.frombuffer(file_handler.read(2)[:1], dtype=np.uint8)[0] - file_handler.read(padding_offset) - - # save the tensor info in result to re-use memory - stage1_key_to_tensor[key] = np.frombuffer( - file_handler.read(tensor_meta.nbytes), dtype=tensor_meta.dtype - ).reshape(tensor_meta.size) - - def persistent_load_stage2(saved_id): - assert isinstance(saved_id, tuple) - key = saved_id[2] - return stage1_key_to_tensor[key] - - # 4. read the structure of storage - unpickler_stage2 = UnpicklerWrapperStage(io.BytesIO(data_iostream), **pickle_load_args) - unpickler_stage2.persistent_load = persistent_load_stage2 - result_stage2 = unpickler_stage2.load() - - return result_stage2 - - -def shave_segments(path, n_shave_prefix_segments=1): - """ - Removes segments. Positive values shave the first segments, negative shave the last segments. 
- """ - if n_shave_prefix_segments >= 0: - return ".".join(path.split(".")[n_shave_prefix_segments:]) - else: - return ".".join(path.split(".")[:n_shave_prefix_segments]) - - -def renew_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item.replace("in_layers.0", "norm1") - new_item = new_item.replace("in_layers.2", "conv1") - - new_item = new_item.replace("out_layers.0", "norm2") - new_item = new_item.replace("out_layers.3", "conv2") - - new_item = new_item.replace("emb_layers.1", "time_emb_proj") - new_item = new_item.replace("skip_connection", "conv_shortcut") - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace("norm.weight", "group_norm.weight") - new_item = new_item.replace("norm.bias", "group_norm.bias") - - new_item = new_item.replace("q.weight", "query.weight") - new_item = new_item.replace("q.bias", "query.bias") - - new_item = new_item.replace("k.weight", "key.weight") - new_item = new_item.replace("k.bias", "key.bias") - - new_item = new_item.replace("v.weight", "value.weight") - new_item = new_item.replace("v.bias", "value.bias") - - new_item = new_item.replace("proj_out.weight", "proj_attn.weight") - new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def assign_to_checkpoint( - paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None -): - """ - This does the final conversion step: take locally converted weights and apply a global renaming - to them. It splits attention layers, and takes into account additional replacements - that may arise. - - Assigns the weights to the new checkpoint. - """ - assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." - - # Splits the attention layers into three variables. 
- if attention_paths_to_split is not None: - for path, path_map in attention_paths_to_split.items(): - old_tensor = old_checkpoint[path] - channels = old_tensor.shape[0] // 3 - - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) - - num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - - old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) - - query, key, value = np.split(old_tensor, 3, axis=1) - - checkpoint[path_map["query"]] = query.reshape(target_shape) - checkpoint[path_map["key"]] = key.reshape(target_shape) - checkpoint[path_map["value"]] = value.reshape(target_shape) - - for path in paths: - new_path = path["new"] - - # These have already been assigned - if attention_paths_to_split is not None and new_path in attention_paths_to_split: - continue - - # Global renaming happens here - new_path = new_path.replace("middle_block.0", "mid_block.resnets.0") - new_path = new_path.replace("middle_block.1", "mid_block.attentions.0") - new_path = new_path.replace("middle_block.2", "mid_block.resnets.1") - - if additional_replacements is not None: - for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], replacement["new"]) - - # proj_attn.weight has to be converted from conv 1D to linear - if "proj_attn.weight" in new_path: - checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0] - else: - checkpoint[new_path] = old_checkpoint[path["old"]] - - -def conv_attn_to_linear(checkpoint): - keys = list(checkpoint.keys()) - attn_keys = ["query.weight", "key.weight", "value.weight"] - for key in keys: - if ".".join(key.split(".")[-2:]) in attn_keys: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0, 0] - elif "proj_attn.weight" in key: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0] - - -def create_unet_diffusers_config(original_config, image_size: int): - """ - Creates a config for the diffusers based on the config of the LDM model. 
- """ - unet_params = original_config.model.params.unet_config.params - vae_params = original_config.model.params.first_stage_config.params.ddconfig - - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] - - down_block_types = [] - resolution = 1 - for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" - down_block_types.append(block_type) - if i != len(block_out_channels) - 1: - resolution *= 2 - - up_block_types = [] - for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" - up_block_types.append(block_type) - resolution //= 2 - - vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) - - head_dim = unet_params.num_heads if "num_heads" in unet_params else None - use_linear_projection = ( - unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False - ) - if use_linear_projection: - # stable diffusion 2-base-512 and 2-768 - if head_dim is None: - head_dim = [5, 10, 20, 20] - - config = dict( - sample_size=image_size // vae_scale_factor, - in_channels=unet_params.in_channels, - out_channels=unet_params.out_channels, - down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), - block_out_channels=tuple(block_out_channels), - layers_per_block=unet_params.num_res_blocks, - cross_attention_dim=unet_params.context_dim, - attention_head_dim=head_dim, - use_linear_projection=use_linear_projection, - ) - - return config - - -def create_vae_diffusers_config(original_config, image_size: int): - """ - Creates a config for the diffusers based on the config of the LDM model. - """ - vae_params = original_config.model.params.first_stage_config.params.ddconfig - _ = original_config.model.params.first_stage_config.params.embed_dim - - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] - down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) - up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) - - config = dict( - sample_size=image_size, - in_channels=vae_params.in_channels, - out_channels=vae_params.out_ch, - down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), - block_out_channels=tuple(block_out_channels), - latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, - ) - return config - - -def create_diffusers_schedular(original_config): - schedular = DDIMScheduler( - num_train_timesteps=original_config.model.params.timesteps, - beta_start=original_config.model.params.linear_start, - beta_end=original_config.model.params.linear_end, - beta_schedule="scaled_linear", - ) - return schedular - - -def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): - """ - Takes a state dict and a config, and returns a converted checkpoint. - """ - - # extract state_dict for UNet - unet_state_dict = {} - keys = list(checkpoint.keys()) - - unet_key = "model.diffusion_model." - # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA - if sum(k.startswith("model_ema") for k in keys) > 100: - print(f"Checkpoint {path} has both EMA and non-EMA weights.") - if extract_ema: - print( - "In this conversion only the EMA weights are extracted. 
If you want to instead extract the non-EMA" - " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag." - ) - for key in keys: - if key.startswith("model.diffusion_model"): - flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) - else: - print( - "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" - " weights (usually better for inference), please make sure to add the `--extract_ema` flag." - ) - - for key in keys: - if key.startswith(unet_key): - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) - - new_checkpoint = {} - - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] - - new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] - new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] - - new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] - new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] - new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] - new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] - - # Retrieves the keys for the input blocks only - num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) - input_blocks = { - layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] - for layer_id in range(num_input_blocks) - } - - # Retrieves the keys for the middle blocks only - num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) - middle_blocks = { - layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] - for layer_id in range(num_middle_blocks) - } - - # Retrieves the keys for the output blocks only - num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) - output_blocks = { - layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] - for layer_id in range(num_output_blocks) - } - - for i in range(1, num_input_blocks): - block_id = (i - 1) // (config["layers_per_block"] + 1) - layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) - - resnets = [ - key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key - ] - attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] - - if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.weight" - ) - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.bias" - ) - - paths = renew_resnet_paths(resnets) - meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = {"old": f"input_blocks.{i}.1", "new": 
f"down_blocks.{block_id}.attentions.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - resnet_0 = middle_blocks[0] - attentions = middle_blocks[1] - resnet_1 = middle_blocks[2] - - resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) - - resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) - - attentions_paths = renew_attention_paths(attentions) - meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} - assign_to_checkpoint( - attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - for i in range(num_output_blocks): - block_id = i // (config["layers_per_block"] + 1) - layer_in_block_id = i % (config["layers_per_block"] + 1) - output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] - output_block_list = {} - - for layer in output_block_layers: - layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) - if layer_id in output_block_list: - output_block_list[layer_id].append(layer_name) - else: - output_block_list[layer_id] = [layer_name] - - if len(output_block_list) > 1: - resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] - attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] - - resnet_0_paths = renew_resnet_paths(resnets) - paths = renew_resnet_paths(resnets) - - meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - if ["conv.weight", "conv.bias"] in output_block_list.values(): - index = list(output_block_list.values()).index(["conv.weight", "conv.bias"]) - new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight" - ] - new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias" - ] - - # Clear attentions as they have been attributed above. - if len(attentions) == 2: - attentions = [] - - if ["conv.bias", "conv.weight"] in output_block_list.values(): - index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) - new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight" - ] - new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias" - ] - - # Clear attentions as they have been attributed above. 
- if len(attentions) == 2: - attentions = [] - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = { - "old": f"output_blocks.{i}.1", - "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", - } - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - else: - resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) - for path in resnet_0_paths: - old_path = ".".join(["output_blocks", str(i), path["old"]]) - new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) - - new_checkpoint[new_path] = unet_state_dict[old_path] - - return new_checkpoint - - -def convert_ldm_vae_checkpoint(checkpoint, config): - # extract state dict for VAE - vae_state_dict = {} - vae_key = "first_stage_model." - keys = list(checkpoint.keys()) - for key in keys: - if key.startswith(vae_key): - vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) - - new_checkpoint = {} - - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] - - new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] - new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] - - # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) - down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) - } - - # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) - up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) - } - - for i in range(num_down_blocks): - resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] - - if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight" - ) - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias" - ) - - paths = renew_vae_resnet_paths(resnets) - meta_path = 
{"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - - for i in range(num_up_blocks): - block_id = num_up_blocks - 1 - i - resnets = [ - key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key - ] - - if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight" - ] - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias" - ] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - return new_checkpoint - - -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"): - need_transpose = [] - for k, v in vae_or_unet.named_sublayers(include_self=True): - if isinstance(v, paddle.nn.Linear): - need_transpose.append(k + ".weight") - new_vae_or_unet = {} - for k, v in diffusers_vae_unet_checkpoint.items(): - if k not in need_transpose: - new_vae_or_unet[k] = v.astype(dtype) - else: - new_vae_or_unet[k] = v.T.astype(dtype) - return new_vae_or_unet - - -def check_keys(model, state_dict): - cls_name = model.__class__.__name__ - missing_keys = [] - mismatched_keys = [] - for k, v in model.state_dict().items(): - if k not in state_dict.keys(): - missing_keys.append(k) - if list(v.shape) != list(state_dict[k].shape): - mismatched_keys.append(k) - if len(missing_keys): - missing_keys_str = ", ".join(missing_keys) - print(f"{cls_name} Found missing_keys {missing_keys_str}!") - if len(mismatched_keys): 
- mismatched_keys_str = ", ".join(mismatched_keys) - print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") - - -def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): - clip = {} - for key in checkpoint.keys(): - if key.startswith("cond_stage_model.transformer"): - clip[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] - - new_model_state = {} - transformers2ppnlp = { - ".encoder.": ".transformer.", - ".layer_norm": ".norm", - ".mlp.": ".", - ".fc1.": ".linear1.", - ".fc2.": ".linear2.", - ".final_layer_norm.": ".ln_final.", - ".embeddings.": ".", - ".position_embedding.": ".positional_embedding.", - ".patch_embedding.": ".conv1.", - "visual_projection.weight": "vision_projection", - "text_projection.weight": "text_projection", - ".pre_layrnorm.": ".ln_pre.", - ".post_layernorm.": ".ln_post.", - ".vision_model.": ".", - } - ignore_value = ["position_ids"] - donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] - for name, value in clip.items(): - # step1: ignore position_ids - if any(i in name for i in ignore_value): - continue - # step2: transpose nn.Linear weight - if value.ndim == 2 and not any(i in name for i in donot_transpose): - value = value.T - # step3: hf_name -> ppnlp_name mapping - for hf_name, ppnlp_name in transformers2ppnlp.items(): - name = name.replace(hf_name, ppnlp_name) - # step4: 0d tensor -> 1d tensor - if name == "logit_scale": - value = value.reshape((1,)) - - new_model_state[name] = value.astype(dtype) - - new_config = { - "max_text_length": new_model_state["text_model.positional_embedding.weight"].shape[0], - "vocab_size": new_model_state["text_model.token_embedding.weight"].shape[0], - "text_embed_dim": new_model_state["text_model.token_embedding.weight"].shape[1], - "text_heads": 12, - "text_layers": 12, - "text_hidden_act": "quick_gelu", - "projection_dim": 768, - "initializer_range": 0.02, - "initializer_factor": 1.0, - } - return new_model_state, new_config - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." - ) - parser.add_argument( - "--original_config_file", - default=None, - type=str, - help="The YAML config file corresponding to the original architecture.", - ) - parser.add_argument( - "--num_in_channels", - default=None, - type=int, - help="The number of input channels. If `None` number of input channels will be automatically inferred.", - ) - parser.add_argument( - "--scheduler_type", - default="pndm", - type=str, - help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancestral', 'dpm']", - ) - parser.add_argument( - "--extract_ema", - action="store_true", - help=( - "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" - " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield" - " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning." 
- ), - ) - parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") - args = parser.parse_args() - - image_size = 512 - checkpoint = load_torch(args.checkpoint_path) - checkpoint = checkpoint.get("state_dict", checkpoint) - - if args.original_config_file is None: - get_path_from_url( - "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/v1-inference.yaml", - root_dir="./", - ) - args.original_config_file = "./v1-inference.yaml" - - original_config = OmegaConf.load(args.original_config_file) - - if args.num_in_channels is not None: - original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = args.num_in_channels - - num_train_timesteps = original_config.model.params.timesteps - beta_start = original_config.model.params.linear_start - beta_end = original_config.model.params.linear_end - - scheduler = DDIMScheduler( - beta_end=beta_end, - beta_schedule="scaled_linear", - beta_start=beta_start, - num_train_timesteps=num_train_timesteps, - steps_offset=1, - clip_sample=False, - set_alpha_to_one=False, - ) - # make sure scheduler works correctly with DDIM - scheduler.register_to_config(clip_sample=False) - - if args.scheduler_type == "pndm": - config = dict(scheduler.config) - config["skip_prk_steps"] = True - scheduler = PNDMScheduler.from_config(config) - elif args.scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config(scheduler.config) - elif args.scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config(scheduler.config) - elif args.scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config(scheduler.config) - elif args.scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) - elif args.scheduler_type == "dpm": - scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) - elif args.scheduler_type == "ddim": - scheduler = scheduler - else: - raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") - - # 1. Convert the UNet2DConditionModel model. - diffusers_unet_config = create_unet_diffusers_config(original_config, image_size=image_size) - diffusers_unet_checkpoint = convert_ldm_unet_checkpoint( - checkpoint, diffusers_unet_config, path=args.checkpoint_path, extract_ema=args.extract_ema - ) - unet = UNet2DConditionModel.from_config(diffusers_unet_config) - ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(unet, diffusers_unet_checkpoint) - check_keys(unet, ppdiffusers_unet_checkpoint) - unet.load_dict(ppdiffusers_unet_checkpoint) - - # 2. Convert the VAE model. - vae_config = create_vae_diffusers_config(original_config, image_size=image_size) - diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) - vae = AutoencoderKL.from_config(vae_config) - ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint) - check_keys(vae, ppdiffusers_vae_checkpoint) - vae.load_dict(ppdiffusers_vae_checkpoint) - - # 3. Convert the text_encoder model. - text_model_state_dict, text_config = convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32") - text_model = CLIPTextModel(CLIPTextConfig.from_dict(text_config)) - text_model.eval() - check_keys(text_model, text_model_state_dict) - text_model.load_dict(text_model_state_dict) - - # 4. Convert the tokenizer. 
- tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - - pipe = StableDiffusionPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - pipe.save_pretrained(args.dump_path) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_orig_stablediffusion2.0_ckpt_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_orig_stablediffusion2.0_ckpt_to_ppdiffusers.py deleted file mode 100644 index 24e11e22ba50..000000000000 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_orig_stablediffusion2.0_ckpt_to_ppdiffusers.py +++ /dev/null @@ -1,755 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os - -import paddle -import torch - -try: - from omegaconf import OmegaConf -except ImportError: - raise ImportError( - "OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`." - ) -from transformers import CLIPTextModel as HFCLIPTextModel - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionPipeline, - UNet2DConditionModel, -) - -paddle.set_device("cpu") - - -def shave_segments(path, n_shave_prefix_segments=1): - """ - Removes segments. Positive values shave the first segments, negative shave the last segments. 
- """ - if n_shave_prefix_segments >= 0: - return ".".join(path.split(".")[n_shave_prefix_segments:]) - else: - return ".".join(path.split(".")[:n_shave_prefix_segments]) - - -def renew_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item.replace("in_layers.0", "norm1") - new_item = new_item.replace("in_layers.2", "conv1") - - new_item = new_item.replace("out_layers.0", "norm2") - new_item = new_item.replace("out_layers.3", "conv2") - - new_item = new_item.replace("emb_layers.1", "time_emb_proj") - new_item = new_item.replace("skip_connection", "conv_shortcut") - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace("norm.weight", "group_norm.weight") - new_item = new_item.replace("norm.bias", "group_norm.bias") - - new_item = new_item.replace("q.weight", "query.weight") - new_item = new_item.replace("q.bias", "query.bias") - - new_item = new_item.replace("k.weight", "key.weight") - new_item = new_item.replace("k.bias", "key.bias") - - new_item = new_item.replace("v.weight", "value.weight") - new_item = new_item.replace("v.bias", "value.bias") - - new_item = new_item.replace("proj_out.weight", "proj_attn.weight") - new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def assign_to_checkpoint( - paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None -): - """ - This does the final conversion step: take locally converted weights and apply a global renaming - to them. It splits attention layers, and takes into account additional replacements - that may arise. - Assigns the weights to the new checkpoint. - """ - assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." - - # Splits the attention layers into three variables. 
- if attention_paths_to_split is not None: - for path, path_map in attention_paths_to_split.items(): - old_tensor = old_checkpoint[path] - channels = old_tensor.shape[0] // 3 - - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) - - num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - - old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) - query, key, value = old_tensor.split(channels // num_heads, dim=1) - - checkpoint[path_map["query"]] = query.reshape(target_shape) - checkpoint[path_map["key"]] = key.reshape(target_shape) - checkpoint[path_map["value"]] = value.reshape(target_shape) - - for path in paths: - new_path = path["new"] - - # These have already been assigned - if attention_paths_to_split is not None and new_path in attention_paths_to_split: - continue - - # Global renaming happens here - new_path = new_path.replace("middle_block.0", "mid_block.resnets.0") - new_path = new_path.replace("middle_block.1", "mid_block.attentions.0") - new_path = new_path.replace("middle_block.2", "mid_block.resnets.1") - - if additional_replacements is not None: - for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], replacement["new"]) - - # proj_attn.weight has to be converted from conv 1D to linear - if "proj_attn.weight" in new_path: - checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0] - else: - checkpoint[new_path] = old_checkpoint[path["old"]] - - -def conv_attn_to_linear(checkpoint): - keys = list(checkpoint.keys()) - attn_keys = ["query.weight", "key.weight", "value.weight"] - for key in keys: - if ".".join(key.split(".")[-2:]) in attn_keys: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0, 0] - elif "proj_attn.weight" in key: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0] - - -def create_unet_diffusers_config(original_config): - """ - Creates a config for the diffusers based on the config of the LDM model. - """ - unet_params = original_config.model.params.unet_config.params - - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] - - down_block_types = [] - resolution = 1 - for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" - down_block_types.append(block_type) - if i != len(block_out_channels) - 1: - resolution *= 2 - - up_block_types = [] - for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" - up_block_types.append(block_type) - resolution //= 2 - - try: - attention_head_dim = unet_params.num_heads - except Exception: - attention_head_dim = unet_params.num_head_channels - config = dict( - sample_size=unet_params.image_size, - in_channels=unet_params.in_channels, - out_channels=unet_params.out_channels, - down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), - block_out_channels=tuple(block_out_channels), - layers_per_block=unet_params.num_res_blocks, - cross_attention_dim=unet_params.context_dim, - attention_head_dim=attention_head_dim, - ) - - return config - - -def create_vae_diffusers_config(original_config): - """ - Creates a config for the diffusers based on the config of the LDM model. 
- """ - vae_params = original_config.model.params.first_stage_config.params.ddconfig - _ = original_config.model.params.first_stage_config.params.embed_dim - - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] - down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) - up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) - - config = dict( - sample_size=vae_params.resolution, - in_channels=vae_params.in_channels, - out_channels=vae_params.out_ch, - down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), - block_out_channels=tuple(block_out_channels), - latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, - ) - return config - - -def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): - """ - Takes a state dict and a config, and returns a converted checkpoint. - """ - - # extract state_dict for UNet - unet_state_dict = {} - keys = list(checkpoint.keys()) - - unet_key = "model.diffusion_model." - # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA - if sum(k.startswith("model_ema") for k in keys) > 100: - print(f"Checkpoint {path} has both EMA and non-EMA weights.") - if extract_ema: - print( - "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA" - " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag." - ) - for key in keys: - if key.startswith("model.diffusion_model"): - flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) - else: - print( - "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" - " weights (usually better for inference), please make sure to add the `--extract_ema` flag." 
- ) - - for key in keys: - if key.startswith(unet_key): - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) - - new_checkpoint = {} - - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] - - new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] - new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] - - new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] - new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] - new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] - new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] - - # Retrieves the keys for the input blocks only - num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) - input_blocks = { - layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] - for layer_id in range(num_input_blocks) - } - - # Retrieves the keys for the middle blocks only - num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) - middle_blocks = { - layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] - for layer_id in range(num_middle_blocks) - } - - # Retrieves the keys for the output blocks only - num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) - output_blocks = { - layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] - for layer_id in range(num_output_blocks) - } - - for i in range(1, num_input_blocks): - block_id = (i - 1) // (config["layers_per_block"] + 1) - layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) - - resnets = [ - key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key - ] - attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] - - if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.weight" - ) - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.bias" - ) - - paths = renew_resnet_paths(resnets) - meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - resnet_0 = middle_blocks[0] - attentions = middle_blocks[1] - resnet_1 = middle_blocks[2] - - resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) - - resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) - - attentions_paths = 
renew_attention_paths(attentions) - meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} - assign_to_checkpoint( - attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - for i in range(num_output_blocks): - block_id = i // (config["layers_per_block"] + 1) - layer_in_block_id = i % (config["layers_per_block"] + 1) - output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] - output_block_list = {} - - for layer in output_block_layers: - layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) - if layer_id in output_block_list: - output_block_list[layer_id].append(layer_name) - else: - output_block_list[layer_id] = [layer_name] - - if len(output_block_list) > 1: - resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] - attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] - - resnet_0_paths = renew_resnet_paths(resnets) - paths = renew_resnet_paths(resnets) - - meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - if ["conv.weight", "conv.bias"] in output_block_list.values(): - index = list(output_block_list.values()).index(["conv.weight", "conv.bias"]) - new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight" - ] - new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias" - ] - - # Clear attentions as they have been attributed above. - if len(attentions) == 2: - attentions = [] - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = { - "old": f"output_blocks.{i}.1", - "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", - } - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - else: - resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) - for path in resnet_0_paths: - old_path = ".".join(["output_blocks", str(i), path["old"]]) - new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) - - new_checkpoint[new_path] = unet_state_dict[old_path] - - return new_checkpoint - - -def convert_ldm_vae_checkpoint(checkpoint, config): - # extract state dict for VAE - vae_state_dict = {} - vae_key = "first_stage_model." 
- keys = list(checkpoint.keys()) - for key in keys: - if key.startswith(vae_key): - vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) - - new_checkpoint = {} - - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] - - new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] - new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] - - # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) - down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) - } - - # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) - up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) - } - - for i in range(num_down_blocks): - resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] - - if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight" - ) - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias" - ) - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} - assign_to_checkpoint(paths, 
new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - - for i in range(num_up_blocks): - block_id = num_up_blocks - 1 - i - resnets = [ - key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key - ] - - if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight" - ] - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias" - ] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - return new_checkpoint - - -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"): - need_transpose = [] - for k, v in vae_or_unet.named_sublayers(include_self=True): - if isinstance(v, paddle.nn.Linear): - need_transpose.append(k + ".weight") - new_vae_or_unet = {} - for k, v in diffusers_vae_unet_checkpoint.items(): - if k not in need_transpose: - new_vae_or_unet[k] = v.numpy().astype(dtype) - else: - new_vae_or_unet[k] = v.t().numpy().astype(dtype) - return new_vae_or_unet - - -def check_keys(model, state_dict): - cls_name = model.__class__.__name__ - missing_keys = [] - mismatched_keys = [] - for k, v in model.state_dict().items(): - if k not in state_dict.keys(): - missing_keys.append(k) - if list(v.shape) != list(state_dict[k].shape): - mismatched_keys.append(k) - if len(missing_keys): - missing_keys_str = ", ".join(missing_keys) - print(f"{cls_name} Found missing_keys {missing_keys_str}!") - if len(mismatched_keys): - mismatched_keys_str = ", ".join(mismatched_keys) - print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") - - -def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"): - layer_need_to_ignore = clip.config.num_hidden_layers - layer_idx - - new_model_state = {} - transformers2ppnlp = { - ".encoder.": ".transformer.", - ".layer_norm": ".norm", - ".mlp.": ".", - ".fc1.": ".linear1.", - ".fc2.": ".linear2.", - ".final_layer_norm.": ".ln_final.", - ".embeddings.": ".", - ".position_embedding.": ".positional_embedding.", - ".patch_embedding.": ".conv1.", - "visual_projection.weight": "vision_projection", - "text_projection.weight": "text_projection", - ".pre_layrnorm.": ".ln_pre.", - ".post_layernorm.": ".ln_post.", - ".vision_model.": ".", - } - ignore_value = ["position_ids"] - donot_transpose = ["embeddings", "norm", 
"concept_embeds", "special_care_embeds"] - for name, value in clip.state_dict().items(): - if f".{layer_need_to_ignore}." in name: - continue - # step1: ignore position_ids - if any(i in name for i in ignore_value): - continue - # step2: transpose nn.Linear weight - if value.ndim == 2 and not any(i in name for i in donot_transpose): - value = value.t() - # step3: hf_name -> ppnlp_name mapping - for hf_name, ppnlp_name in transformers2ppnlp.items(): - name = name.replace(hf_name, ppnlp_name) - # step4: 0d tensor -> 1d tensor - if name == "logit_scale": - value = value.reshape((1,)) - # step5: safety_checker need prefix "clip." - new_model_state[name] = value.cpu().numpy().astype(dtype) - - new_config = { - "max_text_length": clip.config.max_position_embeddings, - "vocab_size": clip.config.vocab_size, - "text_embed_dim": clip.config.hidden_size, - "text_heads": clip.config.num_attention_heads, - "text_layers": clip.config.num_hidden_layers - layer_idx, - "text_hidden_act": clip.config.hidden_act, - "projection_dim": clip.config.projection_dim, - "initializer_range": clip.config.initializer_range, - "initializer_factor": clip.config.initializer_factor, - } - return new_model_state, new_config - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." - ) - parser.add_argument( - "--original_config_file", - default="v2-inference.yaml", - type=str, - help="The YAML config file corresponding to the original architecture.", - ) - parser.add_argument( - "--scheduler_type", - default="ddim", - type=str, - choices=["ddim", "lms", "pndm", "euler-ancest"], - help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler-ancest']", - ) - parser.add_argument( - "--extract_ema", - action="store_true", - help=( - "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" - " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield" - " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning." - ), - ) - parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") - - args = parser.parse_args() - - if args.original_config_file is None: - os.system( - "wget https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference.yaml" - ) - args.original_config_file = "./v2-inference.yaml" - - original_config = OmegaConf.load(args.original_config_file) - - checkpoint = torch.load(args.checkpoint_path, map_location="cpu") - checkpoint = checkpoint.get("state_dict", checkpoint) - - # 1. Convert the UNet2DConditionModel model. - diffusers_unet_config = create_unet_diffusers_config(original_config) - diffusers_unet_checkpoint = convert_ldm_unet_checkpoint( - checkpoint, diffusers_unet_config, path=args.checkpoint_path, extract_ema=args.extract_ema - ) - unet = UNet2DConditionModel(**diffusers_unet_config) - ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(unet, diffusers_unet_checkpoint) - check_keys(unet, ppdiffusers_unet_checkpoint) - unet.load_dict(ppdiffusers_unet_checkpoint) - - # 2. Convert the VAE model. 
- vae_config = create_vae_diffusers_config(original_config) - diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) - vae = AutoencoderKL(**vae_config) - ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint) - check_keys(vae, ppdiffusers_vae_checkpoint) - vae.load_dict(ppdiffusers_vae_checkpoint) - - # 3. Convert the text model. - text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] - layer = original_config.model.params.cond_stage_config.params.layer - if layer == "last": - layer_idx = 0 - elif layer == "penultimate": - layer_idx = 1 - else: - raise NotImplementedError() - - if text_model_type != "FrozenOpenCLIPEmbedder": - print("We only support FrozenOpenCLIPEmbedder as text_encoder!") - - clip = HFCLIPTextModel.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K") - ppdiffusers_clip_checkpoint, clip_config = convert_hf_clip_to_ppnlp_clip(clip, layer_idx) - - text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(clip_config)) - text_encoder.load_dict(ppdiffusers_clip_checkpoint) - - # 5. load tokenizer. - pp_tokenizer = CLIPTokenizer.from_pretrained( - "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", pad_token="!", model_max_length=77 - ) - - # 6. Convert scheduler. - num_train_timesteps = original_config.model.params.timesteps - beta_start = original_config.model.params.linear_start - beta_end = original_config.model.params.linear_end - if args.scheduler_type == "pndm": - scheduler = PNDMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - # Make sure the scheduler compatible with DDIM - set_alpha_to_one=False, - steps_offset=1, - # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, - ) - elif args.scheduler_type == "lms": - scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") - elif args.scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler( - beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear" - ) - elif args.scheduler_type == "ddim": - scheduler = DDIMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - # Make sure the scheduler compatible with DDIM - clip_sample=False, - set_alpha_to_one=False, - steps_offset=1, - ) - else: - raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") - - pipe = StableDiffusionPipeline( - vae=vae, - text_encoder=text_encoder, - tokenizer=pp_tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - - pipe.save_pretrained(args.dump_path) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_ppdiffusers_stable_diffusion_to_fastdeploy.py b/ppdiffusers/scripts/convert_diffusers_model/convert_ppdiffusers_stable_diffusion_to_fastdeploy.py deleted file mode 100644 index 2b06739e1214..000000000000 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_ppdiffusers_stable_diffusion_to_fastdeploy.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from pathlib import Path -from types import MethodType - -import paddle - -from ppdiffusers import ( - FastDeployStableDiffusionInpaintPipeline, - FastDeployStableDiffusionMegaPipeline, - StableDiffusionPipeline, -) -from ppdiffusers.fastdeploy_utils import FastDeployRuntimeModel - - -def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(model_path: str, output_path: str, mode: bool = False): - pipeline = StableDiffusionPipeline.from_pretrained(model_path, safety_checker=None, feature_extractor=None) - output_path = Path(output_path) - - # get arguments - cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280 - unet_channels = pipeline.unet.config.in_channels # 4 or 9 - vae_in_channels = pipeline.vae.config.in_channels # 3 - vae_latent_channels = pipeline.vae.config.latent_channels # 4 - print( - f"cross_attention_dim: {cross_attention_dim}\n", - f"unet_in_channels: {unet_channels}\n", - f"vae_encoder_in_channels: {vae_in_channels}\n", - f"vae_decoder_latent_channels: {vae_latent_channels}", - ) - # 1. Convert text_encoder - text_encoder = paddle.jit.to_static( - pipeline.text_encoder, - input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids - ) - save_path = os.path.join(args.output_path, "text_encoder", "inference") - paddle.jit.save(text_encoder, save_path) - print(f"Save text_encoder model in {save_path} successfully.") - del pipeline.text_encoder - - # 2. Convert unet - unet = paddle.jit.to_static( - pipeline.unet, - input_spec=[ - paddle.static.InputSpec(shape=[None, unet_channels, None, None], dtype="float32", name="sample"), # sample - paddle.static.InputSpec(shape=[1], dtype="int64", name="timestep"), # timestep - paddle.static.InputSpec( - shape=[None, None, cross_attention_dim], dtype="float32", name="encoder_hidden_states" - ), # encoder_hidden_states - ], - ) - save_path = os.path.join(args.output_path, "unet", "inference") - paddle.jit.save(unet, save_path) - print(f"Save unet model in {save_path} successfully.") - del pipeline.unet - - def forward_vae_encoder_mode(self, z): - return self.encode(z, True).latent_dist.mode() - - def forward_vae_encoder_sample(self, z): - return self.encode(z, True).latent_dist.sample() - - # 3. Convert vae encoder - vae_encoder = pipeline.vae - if mode: - vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder) - else: - vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder) - - vae_encoder = paddle.jit.to_static( - vae_encoder, - input_spec=[ - paddle.static.InputSpec( - shape=[None, vae_in_channels, None, None], dtype="float32", name="sample" # N, C, H, W - ), # latent - ], - ) - # Save vae_encoder in static graph model. - save_path = os.path.join(args.output_path, "vae_encoder", "inference") - paddle.jit.save(vae_encoder, save_path) - print(f"Save vae_encoder model in {save_path} successfully.") - - # 4. 
Convert vae encoder - vae_decoder = pipeline.vae - - def forward_vae_decoder(self, z): - return self.decode(z, True).sample - - vae_decoder.forward = MethodType(forward_vae_decoder, vae_decoder) - vae_decoder = paddle.jit.to_static( - vae_decoder, - input_spec=[ - paddle.static.InputSpec( - shape=[None, vae_latent_channels, None, None], dtype="float32", name="latent_sample" - ), # latent_sample - ], - ) - # Save vae_decoder in static graph model. - save_path = os.path.join(args.output_path, "vae_decoder", "inference") - paddle.jit.save(vae_decoder, save_path) - print(f"Save vae_decoder model in {save_path} successfully.") - del pipeline.vae - - if "inpainting" in model_path: - fd_pipe_cls = FastDeployStableDiffusionInpaintPipeline - else: - fd_pipe_cls = FastDeployStableDiffusionMegaPipeline - - fastdeploy_pipeline = fd_pipe_cls( - vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), - vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_decoder"), - text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder"), - unet=FastDeployRuntimeModel.from_pretrained(output_path / "unet"), - tokenizer=pipeline.tokenizer, - scheduler=pipeline.scheduler, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - fastdeploy_pipeline.save_pretrained(output_path) - print("FastDeploy pipeline saved to", output_path) - - # if "inpainting" in model_path: - # from ppdiffusers.utils import load_image - # img_url = ( - # "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" - # ) - # mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" - # image = load_image(img_url).resize((512, 512)) - # mask_image = load_image(mask_url).resize((512, 512)) - # prompt = "Face of a yellow cat, high resolution, sitting on a park bench" - # image_inpainting = fastdeploy_pipeline( - # prompt=prompt, image=image, mask_image=mask_image, num_inference_steps=10 - # ).images[0] - # image_inpainting.save("image_inpainting_fd_test.png") - # else: - # prompt = "a portrait of shiba inu with a red cap growing on its head. intricate. lifelike. soft light. sony a 7 r iv 5 5 mm. 
cinematic post - processing "
-    #     image_text2img = fastdeploy_pipeline.text2img(prompt, num_inference_steps=10).images[0]
-    #     image_text2img.save("text2img_fd_test.png")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--model_path",
-        type=str,
-        required=True,
-        help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).",
-    )
-    parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.")
-    parser.add_argument("--mode", action="store_true", default=False, help="Export the vae encoder using latent_dist.mode() rather than latent_dist.sample()")
-    args = parser.parse_args()
-
-    convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(args.model_path, args.output_path, args.mode)
diff --git a/ppdiffusers/scripts/convert_diffusers_model/requirements.txt b/ppdiffusers/scripts/convert_diffusers_model/requirements.txt
deleted file mode 100644
index 19e0c9bb6ada..000000000000
--- a/ppdiffusers/scripts/convert_diffusers_model/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-ppdiffusers>0.9.0
-paddlenlp>=2.5.0
-paddlepaddle-gpu
-torch
-diffusers
-transformers
-omegaconf
\ No newline at end of file
diff --git a/ppdiffusers/scripts/fid_clip_score/README.md b/ppdiffusers/scripts/fid_clip_score/README.md
deleted file mode 100644
index 1ea4a625931f..000000000000
--- a/ppdiffusers/scripts/fid_clip_score/README.md
+++ /dev/null
@@ -1,79 +0,0 @@
-# FID score for PaddlePaddle
-
-FID (Frechet Inception Distance) is a measure of the distance between the feature vectors of real and generated images, and is most commonly used to evaluate the sample quality of generative adversarial networks. FID compares two sets of images through the statistics of computer-vision features extracted from the raw images, where these features are computed with the `Inception v3` image classification model. A lower score means the two image sets (and their statistics) are more similar; in the best case the FID is 0.0, meaning the two sets are identical.
-
-
-## Dependencies
-
-- PaddlePaddle
-- Pillow
-- Numpy
-- Scipy
-
-## Quick start
-
-Compute the FID between two image datasets, where `path/to/dataset1`/`path/to/dataset2` are image folders:
-```
-python fid_score.py path/to/dataset1 path/to/dataset2
-```
-
-Compute on CPU:
-```
-python fid_score.py path/to/dataset1 path/to/dataset2 --device cpu
-```
-
-Arguments
-- `batch-size`: batch size to use, 50 by default.
-- `num-workers`: number of subprocesses used for data loading, `min(8, num_cpus)` by default.
-- `device`: device to use; GPU and CPU are supported.
-- `dims`: dimensionality of the Inception features to use, 2048 by default.
-
-## References
-
-- [https://github.com/mseitzer/pytorch-fid](https://github.com/mseitzer/pytorch-fid)
-- [https://github.com/bioinf-jku/TTUR](https://github.com/bioinf-jku/TTUR)
-
-# Evaluating FID score and CLIP score on the COCO English 1k (or 30k) dataset
-
-```shell
-├── outputs
-    ├── mscoco.en_g3  # output images for guidance_scale 3
-        ├── 00000_000.png
-        ├── 00001_000.png
-        ......
-        ├── 00999_000.png
-    ├── mscoco.en_g4  # output images for guidance_scale 4
-        ├── 00000_000.png
-        ├── 00001_000.png
-        ......
-        ├── 00999_000.png
-    ......
-    ├── mscoco.en_g8  # output images for guidance_scale 8
-        ├── 00000_000.png
-        ├── 00001_000.png
-        ......
-        ├── 00999_000.png
-```
-Assuming we already have images laid out in the directory structure above, we can use `compute_fid_clip_score.py` to compute both the FID score and the CLIP score.
-
-```shell
-python compute_fid_clip_score.py \
-    --image_path outputs/mscoco.en_g3 outputs/mscoco.en_g4 outputs/mscoco.en_g5 outputs/mscoco.en_g6 outputs/mscoco.en_g7 outputs/mscoco.en_g8 \
-    --text_file_name coco30k \
-    --clip_model_name_or_path openai/clip-vit-base-patch32 \
-    --resolution 256 \
-    --fid_batch_size 32 \
-    --clip_batch_size 64 \
-    --device gpu
-```
-
-Arguments
-- `image_path`: the image folders to evaluate; separate multiple paths with spaces.
-- `text_file_name`: the caption file used for the CLIP evaluation, one of `["coco1k", "coco10k", "coco30k"]`, where 1k means 1k images and 30k means 30k images.
-- `clip_model_name_or_path`: the CLIP model used for the CLIP evaluation.
-- `resolution`: the image resolution used for the FID evaluation.
-- `fid_batch_size`: the batch size used for the FID evaluation.
-- `clip_batch_size`: the batch size used for the CLIP evaluation.
-- `device`: device to use; GPU and CPU are supported, e.g. "cpu", "gpu:0", "gpu:1".
-
-![ddim-19w-30k-256](https://user-images.githubusercontent.com/50394665/203267067-6367d675-8580-4c3e-90b0-d8c1ed0d58aa.png)
diff --git a/ppdiffusers/scripts/fid_clip_score/compute_fid_clip_score.py b/ppdiffusers/scripts/fid_clip_score/compute_fid_clip_score.py
deleted file mode 100644
index 7e8b040da8ec..000000000000
--- a/ppdiffusers/scripts/fid_clip_score/compute_fid_clip_score.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
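For reference, the FID reported by these scripts is the Fréchet distance between Gaussians fitted to Inception-v3 pool3 activations of the two image sets, d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)) (the formula implemented in `fid_score.py` further below), while the CLIP score is the scaled cosine similarity between caption and image embeddings. The following is a minimal, hedged sketch of the CLIP-score computation, mirroring the `compute_clip_score` helper in the script that follows; the checkpoint name matches that script's default, but the captions and blank placeholder images are purely illustrative.

```python
# Minimal sketch of the CLIP-score computation (mirrors compute_clip_score below).
# Assumes paddlenlp is installed; captions and blank images are placeholders.
import paddle
from PIL import Image

from paddlenlp.transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
model.eval()
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
processor.tokenizer.pad_token_id = 0  # the script below also forces pad_token_id to zero

texts = ["a photo of a cat", "a photo of a dog"]        # captions (placeholders)
images = [Image.new("RGB", (224, 224)) for _ in texts]  # stand-ins for generated images

with paddle.no_grad():
    inputs = processor(
        text=texts,
        images=images,
        return_tensors="pd",
        max_length=processor.tokenizer.model_max_length,
        padding="max_length",
        truncation=True,
    )
    text_embeds = model.get_text_features(input_ids=inputs["input_ids"])
    image_embeds = model.get_image_features(pixel_values=inputs["pixel_values"])

# Normalize, take the per-pair cosine similarity, and scale by logit_scale.
text_embeds = text_embeds / text_embeds.norm(axis=-1, keepdim=True)
image_embeds = image_embeds / image_embeds.norm(axis=-1, keepdim=True)
clip_score = (image_embeds * text_embeds).sum(-1) * model.logit_scale.exp()
print(clip_score.mean().item())
```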
- -import argparse -import json -import math -import os -import pathlib - -import paddle -import pandas as pd -from fid_score import IMAGE_EXTENSIONS, calculate_fid_given_paths -from paddle.utils.download import get_path_from_url -from PIL import Image -from tqdm.auto import tqdm - -from paddlenlp.transformers import CLIPModel, CLIPProcessor -from ppdiffusers.utils import DOWNLOAD_SERVER, PPDIFFUSERS_CACHE - -base_url = DOWNLOAD_SERVER + "/CompVis/data/" -cache_path = os.path.join(PPDIFFUSERS_CACHE, "data") - - -def save_json(data, file_path="statistic_results.json"): - with open(str(file_path), "w", encoding="utf8") as f: - json.dump(data, f, ensure_ascii=False) - - -def batchify(data, batch_size=16): - one_batch = [] - for example in data: - one_batch.append(example) - if len(one_batch) == batch_size: - yield one_batch - one_batch = [] - if one_batch: - yield one_batch - - -@paddle.no_grad() -def compute_clip_score(model, processor, texts, images_path, batch_size=64): - all_text_embeds = [] - all_image_embeds = [] - for text, image_path in tqdm( - zip(batchify(texts, batch_size), batchify(images_path, batch_size)), total=math.ceil(len(texts) / batch_size) - ): - assert len(text) == len(image_path) - batch_inputs = processor( - text=text, - images=[Image.open(image) for image in image_path], - return_tensors="pd", - max_length=processor.tokenizer.model_max_length, - padding="max_length", - truncation=True, - ) - text_embeds = model.get_text_features(input_ids=batch_inputs["input_ids"]) - image_embeds = model.get_image_features(pixel_values=batch_inputs["pixel_values"]) - all_text_embeds.append(text_embeds) - all_image_embeds.append(image_embeds) - - all_text_embeds = paddle.concat(all_text_embeds) - all_image_embeds = paddle.concat(all_image_embeds) - all_text_embeds = all_text_embeds / all_text_embeds.norm(axis=-1, keepdim=True) - all_image_embeds = all_image_embeds / all_image_embeds.norm(axis=-1, keepdim=True) - clip_score = (all_image_embeds * all_text_embeds).sum(-1) * model.logit_scale.exp() - return clip_score - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--image_path", default=None, nargs="+", type=str, help="image_path") - parser.add_argument( - "--text_file_name", - default="coco30k", - choices=["coco1k", "coco10k", "coco30k"], - type=str, - help="text file.", - ) - parser.add_argument( - "--clip_model_name_or_path", default="openai/clip-vit-base-patch32", type=str, help="clip_model_name_or_path" - ) - parser.add_argument("--fid_batch_size", default=32, type=int, help="fid_batch_size") - parser.add_argument("--clip_batch_size", default=64, type=int, help="clip_batch_size") - parser.add_argument("--resolution", default=256, type=int, help="resolution of images") - parser.add_argument("--device", default="gpu", type=str, help="device") - parser.add_argument( - "--only_fid", - action="store_true", - help=("Only eval fid. 
"), - ) - args = parser.parse_args() - - paddle.set_device(args.device) - all_path = args.image_path - text_file_name = args.text_file_name - # dont change - image_num = text_file_name.replace("coco", "") - if image_num == "30k": - os.environ["FLAG_IMAGE_NUM"] = "30000" - elif image_num == "10k": - os.environ["FLAG_IMAGE_NUM"] = "10000" - else: - os.environ["FLAG_IMAGE_NUM"] = "1000" - dataset_name = f"coco_{args.resolution}_{image_num}.npz" - fid_target_file = get_path_from_url(base_url + dataset_name, cache_path) + ".npz" - - text_file = get_path_from_url(base_url + text_file_name + ".tsv", cache_path) - df = pd.read_csv(text_file, sep="\t") - texts = df["caption_en"].tolist() - if not args.only_fid: - model = CLIPModel.from_pretrained(args.clip_model_name_or_path) - model.eval() - processor = CLIPProcessor.from_pretrained(args.clip_model_name_or_path) - # pad_token_id must be set to zero! - processor.tokenizer.pad_token_id = 0 - - results = {"file": [], "fid": []} - for path in all_path: - results["file"].append(path) - # fid score - fid_value = calculate_fid_given_paths( - [fid_target_file, path], - batch_size=args.fid_batch_size, - dims=2048, - num_workers=4, - ) - results["fid"].append(fid_value) - - if not args.only_fid: - # clip score - images_path = sorted( - [image_path for ext in IMAGE_EXTENSIONS for image_path in pathlib.Path(path).glob("*.{}".format(ext))] - ) - clip_score = compute_clip_score(model, processor, texts, images_path, args.clip_batch_size) - if "clip_score" not in results: - results["clip_score"] = [] - _clip_score = clip_score.mean().item() - results["clip_score"].append() - if image_num == "30k": - print(f"=====> clip_score 1k: {clip_score[:1000].mean().item()}") - print(f"=====> clip_score 10k: {clip_score[:10000].mean().item()}") - print(f"fid: {fid_value}, clip_score: {_clip_score}") - else: - print(f"fid: {fid_value}") - # save json file results - save_json(results) - print(results) diff --git a/ppdiffusers/scripts/fid_clip_score/fid_score.py b/ppdiffusers/scripts/fid_clip_score/fid_score.py deleted file mode 100755 index 0dc26c31c601..000000000000 --- a/ppdiffusers/scripts/fid_clip_score/fid_score.py +++ /dev/null @@ -1,331 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) mseitzer Author. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Calculates the Frechet Inception Distance (FID) to evalulate GANs - -The FID metric calculates the distance between two distributions of images. -Typically, we have summary statistics (mean & covariance matrix) of one -of these distributions, while the 2nd distribution is given by a GAN. - -When run as a stand-alone program, it compares the distribution of -images that are stored as PNG/JPEG at a specified location with a -distribution given by summary statistics (in pickle format). - -The FID is calculated by assuming that X_1 and X_2 are the activations of -the pool_3 layer of the inception net for generated samples and real world -samples respectively. 
- -See --help to see further details. - -Code apapted from https://github.com/bioinf-jku/TTUR to use PyTorch instead -of Tensorflow - -Copyright 2018 Institute of Bioinformatics, JKU Linz - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import os -import pathlib -from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser - -import numpy as np -import paddle -import paddle.vision.transforms as TF -from paddle.nn.functional import adaptive_avg_pool2d -from PIL import Image -from scipy import linalg - -try: - from tqdm import tqdm -except ImportError: - # If tqdm is not available, provide a mock version of it - def tqdm(x): - return x - - -from inception import InceptionV3 - -parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) -parser.add_argument("--batch-size", type=int, default=50, help="Batch size to use") -parser.add_argument("--resolution", type=int, default=None, help="The resolution to resize.") -parser.add_argument( - "--num-workers", type=int, help=("Number of processes to use for data loading. " "Defaults to `min(8, num_cpus)`") -) -parser.add_argument("--device", type=str, default=None, help="Device to use. Like cuda, cuda:0 or cpu") -parser.add_argument( - "--dims", - type=int, - default=2048, - choices=list(InceptionV3.BLOCK_INDEX_BY_DIM), - help=("Dimensionality of Inception features to use. " "By default, uses pool3 features"), -) -parser.add_argument( - "--save-stats", - action="store_true", - help=( - "Generate an npz archive from a directory of samples. " - "The first path is used as input and the second as output." - ), -) -parser.add_argument("path", type=str, nargs=2, help=("Paths to the generated images or " "to .npz statistic files")) - -IMAGE_EXTENSIONS = {"bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp"} - - -class ImagePathDataset(paddle.io.Dataset): - def __init__(self, files, transforms=None, resolution=None): - self.files = files - self.transforms = transforms - self.resolution = resolution - - def __len__(self): - return len(self.files) - - def __getitem__(self, i): - path = self.files[i] - img = Image.open(path).convert("RGB") - if self.resolution is not None: - if img.size != (self.resolution, self.resolution): - img = img.resize((self.resolution, self.resolution)) - if self.transforms is not None: - img = self.transforms(img) - return {"img": img} - - -def get_activations(files, model, batch_size=50, dims=2048, num_workers=1, resolution=None): - """Calculates the activations of the pool_3 layer for all images. - - Params: - -- files : List of image files paths - -- model : Instance of inception model - -- batch_size : Batch size of images for the model to process at once. - Make sure that the number of samples is a multiple of - the batch size, otherwise some samples are ignored. This - behavior is retained to match the original FID score - implementation. 
- -- dims : Dimensionality of features returned by Inception - -- num_workers : Number of parallel dataloader workers - - Returns: - -- A numpy array of dimension (num images, dims) that contains the - activations of the given tensor when feeding inception with the - query tensor. - """ - model.eval() - - if batch_size > len(files): - print(("Warning: batch size is bigger than the data size. " "Setting batch size to data size")) - batch_size = len(files) - - dataset = ImagePathDataset(files, transforms=TF.ToTensor(), resolution=resolution) - dataloader = paddle.io.DataLoader( - dataset, - batch_size=batch_size, - shuffle=False, - drop_last=False, - num_workers=num_workers, - ) - - pred_arr = np.empty((len(files), dims)) - - start_idx = 0 - - for batch in tqdm(dataloader): - batch = batch["img"] - with paddle.no_grad(): - pred = model(batch)[0] - - # If model output is not scalar, apply global spatial average pooling. - # This happens if you choose a dimensionality not equal 2048. - if pred.shape[2] != 1 or pred.shape[3] != 1: - pred = adaptive_avg_pool2d(pred, output_size=(1, 1)) - - pred = pred.squeeze(3).squeeze(2).cpu().numpy() - - pred_arr[start_idx : start_idx + pred.shape[0]] = pred - - start_idx = start_idx + pred.shape[0] - - return pred_arr - - -def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): - """Numpy implementation of the Frechet Distance. - The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1) - and X_2 ~ N(mu_2, C_2) is - d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)). - - Stable version by Dougal J. Sutherland. - - Params: - -- mu1 : Numpy array containing the activations of a layer of the - inception net (like returned by the function 'get_predictions') - for generated samples. - -- mu2 : The sample mean over activations, precalculated on an - representative data set. - -- sigma1: The covariance matrix over activations for generated samples. - -- sigma2: The covariance matrix over activations, precalculated on an - representative data set. - - Returns: - -- : The Frechet Distance. - """ - - mu1 = np.atleast_1d(mu1) - mu2 = np.atleast_1d(mu2) - - sigma1 = np.atleast_2d(sigma1) - sigma2 = np.atleast_2d(sigma2) - - assert mu1.shape == mu2.shape, "Training and test mean vectors have different lengths" - assert sigma1.shape == sigma2.shape, "Training and test covariances have different dimensions" - - diff = mu1 - mu2 - - # Product might be almost singular - covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) - if not np.isfinite(covmean).all(): - msg = ("fid calculation produces singular product; " "adding %s to diagonal of cov estimates") % eps - print(msg) - offset = np.eye(sigma1.shape[0]) * eps - covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) - - # Numerical error might give slight imaginary component - if np.iscomplexobj(covmean): - if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3): - m = np.max(np.abs(covmean.imag)) - raise ValueError("Imaginary component {}".format(m)) - covmean = covmean.real - - tr_covmean = np.trace(covmean) - - return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean - - -def calculate_activation_statistics(files, model, batch_size=50, dims=2048, num_workers=1, resolution=None): - """Calculation of the statistics used by the FID. - Params: - -- files : List of image files paths - -- model : Instance of inception model - -- batch_size : The images numpy array is split into batches with - batch size batch_size. 
A reasonable batch size - depends on the hardware. - -- dims : Dimensionality of features returned by Inception - -- num_workers : Number of parallel dataloader workers - - Returns: - -- mu : The mean over samples of the activations of the pool_3 layer of - the inception model. - -- sigma : The covariance matrix of the activations of the pool_3 layer of - the inception model. - """ - act = get_activations(files, model, batch_size, dims, num_workers, resolution=resolution) - mu = np.mean(act, axis=0) - sigma = np.cov(act, rowvar=False) - return mu, sigma - - -def compute_statistics_of_path(path, model, batch_size, dims, num_workers=1, resolution=None): - if path.endswith(".npz"): - with np.load(path) as f: - m, s = f["mu"][:], f["sigma"][:] - else: - path = pathlib.Path(path) - files = sorted([file for ext in IMAGE_EXTENSIONS for file in path.glob("*.{}".format(ext))]) - FLAG_IMAGE_NUM = os.getenv("FLAG_IMAGE_NUM", None) - if FLAG_IMAGE_NUM is not None: - files = files[: int(FLAG_IMAGE_NUM)] - m, s = calculate_activation_statistics(files, model, batch_size, dims, num_workers, resolution=resolution) - - return m, s - - -def calculate_fid_given_paths(paths, batch_size, dims, num_workers=1, resolution=None): - """Calculates the FID of two paths""" - for p in paths: - if not os.path.exists(p): - raise RuntimeError("Invalid path: %s" % p) - - block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims] - - model = InceptionV3([block_idx]) - - m1, s1 = compute_statistics_of_path(paths[0], model, batch_size, dims, num_workers, resolution=resolution) - - m2, s2 = compute_statistics_of_path(paths[1], model, batch_size, dims, num_workers, resolution=resolution) - - fid_value = calculate_frechet_distance(m1, s1, m2, s2) - - return fid_value - - -def save_fid_stats(paths, batch_size, dims, num_workers=1, resolution=None): - """Calculates the FID of two paths""" - if not os.path.exists(paths[0]): - raise RuntimeError("Invalid path: %s" % paths[0]) - - if os.path.exists(paths[1]): - raise RuntimeError("Existing output file: %s" % paths[1]) - - block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims] - - model = InceptionV3([block_idx]) - - print(f"Saving statistics for {paths[0]}") - - m1, s1 = compute_statistics_of_path(paths[0], model, batch_size, dims, num_workers, resolution=resolution) - - np.savez_compressed(paths[1], mu=m1, sigma=s1) - - -def main(): - args = parser.parse_args() - if args.device is not None: - paddle.set_device(args.device) - - if args.num_workers is None: - try: - num_cpus = len(os.sched_getaffinity(0)) - except AttributeError: - # os.sched_getaffinity is not available under Windows, use - # os.cpu_count instead (which may not return the *available* number - # of CPUs). - num_cpus = os.cpu_count() - - num_workers = min(num_cpus, 8) if num_cpus is not None else 0 - else: - num_workers = args.num_workers - - if args.save_stats: - save_fid_stats(args.path, args.batch_size, args.dims, num_workers, resolution=args.resolution) - return - - fid_value = calculate_fid_given_paths( - args.path, args.batch_size, args.dims, num_workers, resolution=args.resolution - ) - print("FID: ", fid_value) - - -if __name__ == "__main__": - main() diff --git a/ppdiffusers/scripts/fid_clip_score/inception.py b/ppdiffusers/scripts/fid_clip_score/inception.py deleted file mode 100644 index 9a024ff75803..000000000000 --- a/ppdiffusers/scripts/fid_clip_score/inception.py +++ /dev/null @@ -1,493 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) mseitzer Author. All Rights Reserved. 
-# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.utils.download import get_weights_path_from_url - -# Inception weights ported to Pytorch from -# http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz -FID_WEIGHTS_URL = ( - "https://paddlenlp.bj.bcebos.com/models/mseitzer/pp_inception-2015-12-05-6726825d.pdparams", - "8e2ae24c34c5c8b81d45167bb9361f4c", -) -WEIGHTS_PATH = "pp_inception-2015-12-05-6726825d.pdparams" - - -class ConvNormActivation(nn.Sequential): - """ - Configurable block used for Convolution-Normalzation-Activation blocks. - This code is based on the torchvision code with modifications. - You can also see at https://github.com/pytorch/vision/blob/main/torchvision/ops/misc.py#L68 - Args: - in_channels (int): Number of channels in the input image - out_channels (int): Number of channels produced by the Convolution-Normalzation-Activation block - kernel_size: (int|list|tuple, optional): Size of the convolving kernel. Default: 3 - stride (int|list|tuple, optional): Stride of the convolution. Default: 1 - padding (int|str|tuple|list, optional): Padding added to all four sides of the input. Default: None, - in wich case it will calculated as ``padding = (kernel_size - 1) // 2 * dilation`` - groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 - norm_layer (Callable[..., paddle.nn.Layer], optional): Norm layer that will be stacked on top of the convolutiuon layer. - If ``None`` this layer wont be used. Default: ``paddle.nn.BatchNorm2D`` - activation_layer (Callable[..., paddle.nn.Layer], optional): Activation function which will be stacked on top of the normalization - layer (if not ``None``), otherwise on top of the conv layer. If ``None`` this layer wont be used. Default: ``paddle.nn.ReLU`` - dilation (int): Spacing between kernel elements. Default: 1 - bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``. - """ - - def __init__( - self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - padding=None, - groups=1, - norm_layer=nn.BatchNorm2D, - activation_layer=nn.ReLU, - dilation=1, - bias=None, - ): - if padding is None: - padding = (kernel_size - 1) // 2 * dilation - if bias is None: - bias = norm_layer is None - layers = [ - nn.Conv2D( - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation=dilation, - groups=groups, - bias_attr=bias, - ) - ] - if norm_layer is not None: - # The hyperparameter of BatchNorm2D is different from PaddlePaddle. 
- layers.append(norm_layer(out_channels, momentum=0.1, epsilon=0.001)) - if activation_layer is not None: - layers.append(activation_layer()) - super().__init__(*layers) - - -class InceptionV3(nn.Layer): - """Pretrained InceptionV3 network returning feature maps""" - - # Index of default block of inception to return, - # corresponds to output of final average pooling - DEFAULT_BLOCK_INDEX = 3 - - # Maps feature dimensionality to their output blocks indices - BLOCK_INDEX_BY_DIM = { - 64: 0, # First max pooling features - 192: 1, # Second max pooling featurs - 768: 2, # Pre-aux classifier features - 2048: 3, # Final average pooling features - } - - def __init__( - self, - output_blocks=(DEFAULT_BLOCK_INDEX,), - resize_input=True, - normalize_input=True, - requires_grad=False, - use_fid_inception=True, - ): - """Build pretrained InceptionV3 - - Parameters - ---------- - output_blocks : list of int - Indices of blocks to return features of. Possible values are: - - 0: corresponds to output of first max pooling - - 1: corresponds to output of second max pooling - - 2: corresponds to output which is fed to aux classifier - - 3: corresponds to output of final average pooling - resize_input : bool - If true, bilinearly resizes input to width and height 299 before - feeding input to model. As the network without fully connected - layers is fully convolutional, it should be able to handle inputs - of arbitrary size, so resizing might not be strictly needed - normalize_input : bool - If true, scales the input from range (0, 1) to the range the - pretrained Inception network expects, namely (-1, 1) - requires_grad : bool - If true, parameters of the model require gradients. Possibly useful - for finetuning the network - use_fid_inception : bool - If true, uses the pretrained Inception model used in Tensorflow's - FID implementation. If false, uses the pretrained Inception model - available in paddle.vision. The FID Inception model has different - weights and a slightly different structure from paddle.vision's - Inception model. If you want to compute FID scores, you are - strongly advised to set this parameter to true to get comparable - results. 
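        Example (editor's sketch; assumes the pretrained FID weights can be downloaded)
        --------------------------------------------------------------------------------
            block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[2048]   # final average-pooling features
            model = InceptionV3([block_idx])
            model.eval()
            x = paddle.rand([2, 3, 299, 299])                  # values expected in range (0, 1)
            with paddle.no_grad():
                feats = model(x)[0]                            # typically shaped [2, 2048, 1, 1]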
- """ - super(InceptionV3, self).__init__() - - self.resize_input = resize_input - self.normalize_input = normalize_input - self.output_blocks = sorted(output_blocks) - self.last_needed_block = max(output_blocks) - - assert self.last_needed_block <= 3, "Last possible output block index is 3" - - self.blocks = nn.LayerList() - - if use_fid_inception: - inception = fid_inception_v3() - else: - inception = _inception_v3(pretrained=True) - - # Block 0: input to maxpool1 - block0 = [ - inception.inception_stem.conv_1a_3x3, - inception.inception_stem.conv_2a_3x3, - inception.inception_stem.conv_2b_3x3, - inception.inception_stem.max_pool, - ] - self.blocks.append(nn.Sequential(*block0)) - - # Block 1: maxpool1 to maxpool2 - if self.last_needed_block >= 1: - block1 = [ - inception.inception_stem.conv_3b_1x1, - inception.inception_stem.conv_4a_3x3, - inception.inception_stem.max_pool, - ] - self.blocks.append(nn.Sequential(*block1)) - - # Block 2: maxpool2 to aux classifier - if self.last_needed_block >= 2: - block2 = [ - inception.inception_block_list[0], - inception.inception_block_list[1], - inception.inception_block_list[2], - inception.inception_block_list[3], - inception.inception_block_list[4], - inception.inception_block_list[5], - inception.inception_block_list[6], - inception.inception_block_list[7], - ] - self.blocks.append(nn.Sequential(*block2)) - - # Block 3: aux classifier to final avgpool - if self.last_needed_block >= 3: - block3 = [ - inception.inception_block_list[8], - inception.inception_block_list[9], - inception.inception_block_list[10], - inception.avg_pool, - ] - self.blocks.append(nn.Sequential(*block3)) - - for param in self.parameters(): - param.stop_gradient = requires_grad - - def forward(self, inp): - """Get Inception feature maps - - Parameters - ---------- - inp : paddle.Tensor - Input tensor of shape Bx3xHxW. Values are expected to be in - range (0, 1) - - Returns - ------- - List of paddle.Tensor, corresponding to the selected output - block, sorted ascending by index - """ - outp = [] - x = inp - if self.resize_input: - x = F.interpolate(x, size=(299, 299), mode="bilinear", align_corners=False) - - if self.normalize_input: - x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1) - for idx, block in enumerate(self.blocks): - x = block(x) - if idx in self.output_blocks: - outp.append(x) - - if idx == self.last_needed_block: - break - - return outp - - -def hack_bn_layer(layer): - if isinstance(layer, nn.BatchNorm2D): - layer._momentum = 0.1 - layer._epsilon = 0.001 - - -def _inception_v3(*args, **kwargs): - """Wraps `paddle.vision.models.inception_v3`""" - return paddle.vision.models.inception_v3(*args, **kwargs).apply(hack_bn_layer) - - -def fid_inception_v3(): - """Build pretrained Inception model for FID computation - - The Inception model for FID computation uses a different set of weights - and has a slightly different structure than paddle.vision's Inception. - - This method first constructs paddle.vision's Inception and then patches the - necessary parts that are different in the FID Inception model. 
- """ - inception = _inception_v3(num_classes=1008, with_pool=True, pretrained=False) - inception.inception_block_list[0] = InceptionA(192, pool_features=32) - inception.inception_block_list[1] = InceptionA(256, pool_features=64) - inception.inception_block_list[2] = InceptionA(288, pool_features=64) - inception.inception_block_list[4] = InceptionC(768, channels_7x7=128) - inception.inception_block_list[5] = InceptionC(768, channels_7x7=160) - inception.inception_block_list[6] = InceptionC(768, channels_7x7=160) - inception.inception_block_list[7] = InceptionC(768, channels_7x7=192) - inception.inception_block_list[9] = InceptionE_1(1280) - inception.inception_block_list[10] = InceptionE_2(2048) - - weight_path = get_weights_path_from_url(FID_WEIGHTS_URL[0], FID_WEIGHTS_URL[1]) - state_dict = paddle.load(weight_path) - inception.set_state_dict(state_dict) - return inception - - -class InceptionA(nn.Layer): - def __init__(self, num_channels, pool_features): - super().__init__() - self.branch1x1 = ConvNormActivation( - in_channels=num_channels, out_channels=64, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - - self.branch5x5_1 = ConvNormActivation( - in_channels=num_channels, out_channels=48, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - self.branch5x5_2 = ConvNormActivation( - in_channels=48, out_channels=64, kernel_size=5, padding=2, activation_layer=nn.ReLU - ) - - self.branch3x3dbl_1 = ConvNormActivation( - in_channels=num_channels, out_channels=64, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - self.branch3x3dbl_2 = ConvNormActivation( - in_channels=64, out_channels=96, kernel_size=3, padding=1, activation_layer=nn.ReLU - ) - self.branch3x3dbl_3 = ConvNormActivation( - in_channels=96, out_channels=96, kernel_size=3, padding=1, activation_layer=nn.ReLU - ) - # Patch: Tensorflow's average pool does not use the padded zero's in - # its average calculation - self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True) - self.branch_pool_conv = ConvNormActivation( - in_channels=num_channels, out_channels=pool_features, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - - def forward(self, x): - branch1x1 = self.branch1x1(x) - branch5x5 = self.branch5x5_1(x) - branch5x5 = self.branch5x5_2(branch5x5) - - branch3x3dbl = self.branch3x3dbl_1(x) - branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) - branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) - - branch_pool = self.branch_pool(x) - branch_pool = self.branch_pool_conv(branch_pool) - x = paddle.concat([branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1) - return x - - -class InceptionC(nn.Layer): - def __init__(self, num_channels, channels_7x7): - super().__init__() - self.branch1x1 = ConvNormActivation( - in_channels=num_channels, out_channels=192, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - - self.branch7x7_1 = ConvNormActivation( - in_channels=num_channels, - out_channels=channels_7x7, - kernel_size=1, - stride=1, - padding=0, - activation_layer=nn.ReLU, - ) - self.branch7x7_2 = ConvNormActivation( - in_channels=channels_7x7, - out_channels=channels_7x7, - kernel_size=(1, 7), - stride=1, - padding=(0, 3), - activation_layer=nn.ReLU, - ) - self.branch7x7_3 = ConvNormActivation( - in_channels=channels_7x7, - out_channels=192, - kernel_size=(7, 1), - stride=1, - padding=(3, 0), - activation_layer=nn.ReLU, - ) - - self.branch7x7dbl_1 = ConvNormActivation( - in_channels=num_channels, out_channels=channels_7x7, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - 
self.branch7x7dbl_2 = ConvNormActivation( - in_channels=channels_7x7, - out_channels=channels_7x7, - kernel_size=(7, 1), - padding=(3, 0), - activation_layer=nn.ReLU, - ) - self.branch7x7dbl_3 = ConvNormActivation( - in_channels=channels_7x7, - out_channels=channels_7x7, - kernel_size=(1, 7), - padding=(0, 3), - activation_layer=nn.ReLU, - ) - self.branch7x7dbl_4 = ConvNormActivation( - in_channels=channels_7x7, - out_channels=channels_7x7, - kernel_size=(7, 1), - padding=(3, 0), - activation_layer=nn.ReLU, - ) - self.branch7x7dbl_5 = ConvNormActivation( - in_channels=channels_7x7, out_channels=192, kernel_size=(1, 7), padding=(0, 3), activation_layer=nn.ReLU - ) - # Patch: Tensorflow's average pool does not use the padded zero's in - # its average calculation - self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True) - self.branch_pool_conv = ConvNormActivation( - in_channels=num_channels, out_channels=192, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - - def forward(self, x): - branch1x1 = self.branch1x1(x) - - branch7x7 = self.branch7x7_1(x) - branch7x7 = self.branch7x7_2(branch7x7) - branch7x7 = self.branch7x7_3(branch7x7) - - branch7x7dbl = self.branch7x7dbl_1(x) - branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) - branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) - branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) - branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) - - branch_pool = self.branch_pool(x) - branch_pool = self.branch_pool_conv(branch_pool) - - x = paddle.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1) - - return x - - -class InceptionE_1(nn.Layer): - def __init__(self, num_channels): - super().__init__() - self.branch1x1 = ConvNormActivation( - in_channels=num_channels, out_channels=320, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - self.branch3x3_1 = ConvNormActivation( - in_channels=num_channels, out_channels=384, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - self.branch3x3_2a = ConvNormActivation( - in_channels=384, out_channels=384, kernel_size=(1, 3), padding=(0, 1), activation_layer=nn.ReLU - ) - self.branch3x3_2b = ConvNormActivation( - in_channels=384, out_channels=384, kernel_size=(3, 1), padding=(1, 0), activation_layer=nn.ReLU - ) - - self.branch3x3dbl_1 = ConvNormActivation( - in_channels=num_channels, out_channels=448, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - self.branch3x3dbl_2 = ConvNormActivation( - in_channels=448, out_channels=384, kernel_size=3, padding=1, activation_layer=nn.ReLU - ) - self.branch3x3dbl_3a = ConvNormActivation( - in_channels=384, out_channels=384, kernel_size=(1, 3), padding=(0, 1), activation_layer=nn.ReLU - ) - self.branch3x3dbl_3b = ConvNormActivation( - in_channels=384, out_channels=384, kernel_size=(3, 1), padding=(1, 0), activation_layer=nn.ReLU - ) - - # Patch: Tensorflow's average pool does not use the padded zero's in - # its average calculation - self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True) - self.branch_pool_conv = ConvNormActivation( - in_channels=num_channels, out_channels=192, kernel_size=1, padding=0, activation_layer=nn.ReLU - ) - - def forward(self, x): - branch1x1 = self.branch1x1(x) - - branch3x3 = self.branch3x3_1(x) - branch3x3 = [ - self.branch3x3_2a(branch3x3), - self.branch3x3_2b(branch3x3), - ] - branch3x3 = paddle.concat(branch3x3, axis=1) - - branch3x3dbl = self.branch3x3dbl_1(x) - branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) - branch3x3dbl = [ - 
self.branch3x3dbl_3a(branch3x3dbl), - self.branch3x3dbl_3b(branch3x3dbl), - ] - branch3x3dbl = paddle.concat(branch3x3dbl, axis=1) - - branch_pool = self.branch_pool(x) - branch_pool = self.branch_pool_conv(branch_pool) - - x = paddle.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) - return x - - -class InceptionE_2(InceptionE_1): - def __init__(self, num_channels): - super(InceptionE_2, self).__init__(num_channels) - - def forward(self, x): - branch1x1 = self.branch1x1(x) - - branch3x3 = self.branch3x3_1(x) - branch3x3 = [ - self.branch3x3_2a(branch3x3), - self.branch3x3_2b(branch3x3), - ] - branch3x3 = paddle.concat(branch3x3, axis=1) - - branch3x3dbl = self.branch3x3dbl_1(x) - branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) - branch3x3dbl = [ - self.branch3x3dbl_3a(branch3x3dbl), - self.branch3x3dbl_3b(branch3x3dbl), - ] - branch3x3dbl = paddle.concat(branch3x3dbl, axis=1) - - # Patch: The FID Inception model uses max pooling instead of average - # pooling. This is likely an error in this specific Inception - # implementation, as other Inception models use average pooling here - # (which matches the description in the paper). - branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1) - branch_pool = self.branch_pool_conv(branch_pool) - - x = paddle.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) - return x diff --git a/ppdiffusers/setup.py b/ppdiffusers/setup.py deleted file mode 100644 index 5e9a202cac9d..000000000000 --- a/ppdiffusers/setup.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os - -from setuptools import find_packages, setup - -description = "PPDiffusers: Diffusers toolbox implemented based on PaddlePaddle" - -with open("requirements.txt") as fin: - REQUIRED_PACKAGES = fin.read() - - -def read(file: str): - current_dir = os.path.dirname(__file__) - path = os.path.join(current_dir, file) - with open(path, "r", encoding="utf-8") as f: - content = f.read().strip() - return content - - -def read_version(): - """read version of ppdiffusers""" - return read("VERSION") - - -def read_readme(): - return read("README.md") - - -def read_requirements(): - content = read("requirements.txt") - packages = content.split("\n") - return packages - - -setup( - name="ppdiffusers", - packages=find_packages(), - version=read_version(), - author="PaddleNLP Team", - author_email="paddlenlp@baidu.com", - description=description, - long_description=read_readme(), - long_description_content_type="text/markdown", - url="https://github.com/PaddlePaddle/PaddleNLP/ppdiffusers", - keywords=["ppdiffusers", "paddle", "paddlenlp"], - install_requires=REQUIRED_PACKAGES, - python_requires=">=3.6", - entry_points={"console_scripts": ["ppdiffusers-cli=ppdiffusers.commands.ppdiffusers_cli:main"]}, - classifiers=[ - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - ], - license="Apache 2.0", -) diff --git a/ppdiffusers/tests/__init__.py b/ppdiffusers/tests/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/fixtures/custom_pipeline/__init__.py b/ppdiffusers/tests/fixtures/custom_pipeline/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/fixtures/custom_pipeline/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
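# Editor's sketch (assumes the ppdiffusers package from the setup.py above is installed):
# the console_scripts entry point "ppdiffusers-cli=ppdiffusers.commands.ppdiffusers_cli:main"
# generates a launcher that is roughly equivalent to the following.
from importlib import import_module

module_path, func_name = "ppdiffusers.commands.ppdiffusers_cli:main".split(":")
cli_main = getattr(import_module(module_path), func_name)
cli_main()  # same effect as running `ppdiffusers-cli` from the shell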
diff --git a/ppdiffusers/tests/fixtures/custom_pipeline/pipeline.py b/ppdiffusers/tests/fixtures/custom_pipeline/pipeline.py deleted file mode 100644 index 17825f9e6e80..000000000000 --- a/ppdiffusers/tests/fixtures/custom_pipeline/pipeline.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and - -# limitations under the License. - - -from typing import Optional, Tuple, Union - -import paddle - -from ppdiffusers import DiffusionPipeline, ImagePipelineOutput - - -class CustomLocalPipeline(DiffusionPipeline): - r""" - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Parameters: - unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of - [`DDPMScheduler`], or [`DDIMScheduler`]. - """ - - def __init__(self, unet, scheduler): - super().__init__() - self.register_modules(unet=unet, scheduler=scheduler) - - @paddle.no_grad() - def __call__( - self, - batch_size: int = 1, - generator: Optional[paddle.Generator] = None, - num_inference_steps: int = 50, - output_type: Optional[str] = "pil", - return_dict: bool = True, - **kwargs, - ) -> Union[ImagePipelineOutput, Tuple]: - r""" - Args: - batch_size (`int`, *optional*, defaults to 1): - The number of images to generate. - generator (`paddle.Generator`, *optional*): - A [v generator] to make generation deterministic. - eta (`float`, *optional*, defaults to 0.0): - The eta parameter which controls the scale of the variance (0 is DDIM and 1 is one type of DDPM). - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - - Returns: - [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if - `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the - generated images. - """ - - # Sample gaussian noise to begin loop - image = paddle.randn( - (batch_size, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size), - generator=generator, - ) - - # set step values - self.scheduler.set_timesteps(num_inference_steps) - - for t in self.progress_bar(self.scheduler.timesteps): - # 1. predict noise model_output - model_output = self.unet(image, t).sample - - # 2. 
predict previous mean of image x_t-1 and add variance depending on eta - # eta corresponds to η in paper and should be between [0, 1] - # do x_t -> x_t-1 - image = self.scheduler.step(model_output, t, image).prev_sample - - image = (image / 2 + 0.5).clip(0, 1) - image = image.cast("float32").transpose([0, 2, 3, 1]).numpy() - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,), "This is a local test" - - return ImagePipelineOutput(images=image), "This is a local test" diff --git a/ppdiffusers/tests/fixtures/custom_pipeline/what_ever.py b/ppdiffusers/tests/fixtures/custom_pipeline/what_ever.py deleted file mode 100644 index 7e7fbd6f85df..000000000000 --- a/ppdiffusers/tests/fixtures/custom_pipeline/what_ever.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and - -# limitations under the License. - - -from typing import Optional, Tuple, Union - -import paddle - -from ppdiffusers.pipeline_utils import DiffusionPipeline, ImagePipelineOutput - - -class CustomLocalPipeline(DiffusionPipeline): - r""" - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Parameters: - unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of - [`DDPMScheduler`], or [`DDIMScheduler`]. - """ - - def __init__(self, unet, scheduler): - super().__init__() - self.register_modules(unet=unet, scheduler=scheduler) - - @paddle.no_grad() - def __call__( - self, - batch_size: int = 1, - generator: Optional[paddle.Generator] = None, - num_inference_steps: int = 50, - output_type: Optional[str] = "pil", - return_dict: bool = True, - **kwargs, - ) -> Union[ImagePipelineOutput, Tuple]: - r""" - Args: - batch_size (`int`, *optional*, defaults to 1): - The number of images to generate. - generator (`paddle.Generator`, *optional*): - A [paddle generator] to make generation deterministic. - eta (`float`, *optional*, defaults to 0.0): - The eta parameter which controls the scale of the variance (0 is DDIM and 1 is one type of DDPM). - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple. - - Returns: - [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if - `return_dict` is True, otherwise a `tuple. 
When returning a tuple, the first element is a list with the - generated images. - """ - - # Sample gaussian noise to begin loop - image = paddle.randn( - (batch_size, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size), - generator=generator, - ) - - # set step values - self.scheduler.set_timesteps(num_inference_steps) - - for t in self.progress_bar(self.scheduler.timesteps): - # 1. predict noise model_output - model_output = self.unet(image, t).sample - - # 2. predict previous mean of image x_t-1 and add variance depending on eta - # eta corresponds to η in paper and should be between [0, 1] - # do x_t -> x_t-1 - image = self.scheduler.step(model_output, t, image).prev_sample - - image = (image / 2 + 0.5).clip(0, 1) - image = image.cast("float32").transpose([0, 2, 3, 1]).numpy() - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,), "This is a local test" - - return ImagePipelineOutput(images=image), "This is a local test" diff --git a/ppdiffusers/tests/fixtures/elise_format0.mid b/ppdiffusers/tests/fixtures/elise_format0.mid deleted file mode 100644 index 33dbabe7ab1d4d28e43d9911255a510a8a672d77..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 14210 zcmeHOUu;xYdjAr(DZ$1#K;3MYddK!Kz8?FIXU1b@#$4=ckH;RyV>}Q@F!{4U>>9Q# zAct(dFLASZs3Q6>+LuVJYW2a=YG1se5~`3W+NV`jBzE7H?LWGxQe&wv+iX`NtxA8t z@0@#Q42c3woK=!Oz@2-}cfRwT@BjCm>*=Fs=0xNv{Cnbwf0;hI_=h6db8>&n(-Z%_ z7%84R``WzRf3l-+;o^4|&n}#~6!}3UolIsTr@s>!n_oOX7nxdoeSTqnQSLvnyYl+n zADvx@eEZaS%*|gqbK%?b=P~|t^}?$cBHy}j=>lH%Ow2D{I=iqi|JtSbbLTEaPJJtq zPo$CwBHczL$1cvDx-`%6fn*|w@l;yJ$4|XJzc>_04`p(Z>9glwzchay<41q%q-+80 z_m6*N%a-P?a)0(oz8&7OegD77k3`=6?K#=z%ZtlE9XUPs>Zwe6-&`u0IhEdb>ec+* ztEZFMWG0gx%w-4D1N8-ORfrBI{dlVrH|PrefYO9Au`pWZ_c_tTJPA0 z)TTr#-Dj|_`QB=)LPo26yL-KvkChMh@bTmC8%UR%HUINh_3^7;>efwvsgLL`#TnhD z`%OAalOpB*CBV15vqNfSky1C`c<5S5q&Oh%-srpj^3j?rlId8=U`!<6K_J~QFXDAB zc_MyH(L{eDH6K9n5Dnlb6XH0#-t{HsFOSq5#bjud8Zd&0T<1~_IHF5fJFX-sLF1UL zV**^s6*u-2FuDG6QEBQTO&~SjwFFxHu1-uo&m)kjz*dR*d~?=we6-#%obKcxxXD4V z(J-z@xp{N#oU(ZD64{=3)#)*jtyc-pNzP3rOAcRq}#NpYiT@N}Y1KY7~8 zCD=l0b)N3!SWCdu9z{ygkf*LVu_hdL$X^~%o_6UpPiT1>)q%w>sgbve3yP;)DLFcK zV@1V0X=L~GpGyrqE;``xPGE+#e5ElG$UVYUt;gObjlT6^2z>ss$9x9_c5&0nHzMCX7K5>sYV zwv>*`LC97It!y7rt9v1V>Ccr`Pw=dZU9Co|Ef>|ELpQQVHMdjeHnO9`nKI+^4MSMwh7tR@Kb_)!C>z4Bx1DOV8gos`%-V zhgF1Czhr$+^hJB5fJ6H^wu%>o;(C`HP?=t#oTNvPHSlXulwU>D;iAb@zX>#71v;TZ&hu{SvS^@3@A`v zV>mq9v|HUy4;R1`w_(rrsVXPb1;Qa%+u zLoBzi(Dk*ne9uBE>h~Q!RFiSWN#p04Oh6IK8Xm*wZR{Y3-UyZrk>+z1y{S|hy_MDe zEvJE<&bqP%36hb9I;?xEa|*JTTxFclxrBw(2SrN;Hil@j|KYSJfe4s~-x}Ezzh&JE zq?a@D;y^t_uP*4(A2&o(eRvw!tvW8X&A@5(YP6=pl-v!TOOwCtf#*tpxjYw5`#0jb zXy4XzQQD8@xm3;XUe`wuv(?lMqGlO5v~G21s|y<)n&!_s1=rO$Wjz@8 zq6gEM5_&K#EyDT(hS*upl{e<0k5_-hKEwK>Om_RXnvDm`8Rlnrf{Sori#MU4xp#IoI=XULdAi!Ja#v&+m zZY@l}>dDzJOCOh<*{m#ZYO0Hfhs;NOM@k7nyqDTgC#q28#_cN;Oslws$dFO|8!I|R zOB?NxQa@@ffU6>hb@(=}LY9P8c{PO{*O+efN}*RuDam$V{+u(Aa4R z2^kjmwqt#Pkl;E79F8SGmc1l|}cRL@jp4Byx>Zvp^8O@=j{7jG}grOg-! 
z?zv-dLTbzlGeZ@5lGy3$v80j*_7*U~_m|@}>@sBXEMf#*@GRv48P;7iffxpa*^?9i z0{-vFFd;Y-BU2!w-L)ekhjT-R&0cgR06m<;Udl>m{RwRO`~I>z4Mu0|d{|^AhdNUs zW}8%0;BO5ann7L*HbiA8D%q$4AYbA!Ru`Nan1`d0b`YFJ7eE1StSG?kD=H{z5fJe_ ziYQB5jAWVxnsWdHP?h$vk3ifuhk%eK_JQ(kAo+m5oFc9HC}lALc@0>JD0e84=m`jI zqEAmoE&>Rk#T4K?%oNfd1P?&0%<}W97w@#J3N!)l6-A8Gp z6pZ9?E-%JT3M4X>4{~o}#XKGPe#xXlNi#b)`efuySB$mbPi4r?;P>>1q+>E1BQYa9 zI)t27ZWo7S(5&PR;0QI6D^C4Nc@DXYF3-e*R_Hvk@K3ids-M~;*21DNo((=Yn_3^WR&{PP8BTb01(&3ixHN%~$uqev9ux?*z6xNLu zWj@VCWu|XTAV7*rN~y8VmT9;OEcKz1VpC$&?4)RrV<~Ws0#W^-nF|{11~#7`OqYn;H?q@A#iT zGw?=p$sW3${CCd~afWS=N?u(qdn!bnQWvHLGUjsMJws(<(ai-PRd*B3Cf41W%1)VN z;IZrOH-2i)?7YO2*mR37ng=pLg{Q?4Odlo8u#9UDGAz{HUG=ut*Msiq*U7HO%UVCn zuKDM*zqomII;|RU7SgtPwt%*9p8K zt#f1i;dbi(<~OIbB3|s;TJv2gxH93&!9m?VwgTNTg9tj;2_8miLR> zztwD@Ti#sT!?&yEYPIza=h)!m!@u<;w%Tb%EHANReB*l6SgXb>AL!iMl{U)3r12rY|#4G zV*&3sVlH}8@>nIgS;9e7AspMyZb}bpRnA%G+XK9+vlSfJU30Taj6Hft=z|o7YKuJv zlvpc47!&EZxn`BjNFeKQ@ZM?zB=~8mp;*-?(XF|r@VWM$E`&P4mr>1%Qa&OvTzz_q z%6vWpvmwtUX4W(Ji@_B)9s!gnL01=)ppXQs65JchX?RrgS}Y~*5a&Q%1F@ELfatoL zYj#`O4Gem>+1AcA?q_Yza^>Et-LD)&(A>>adg#3H)NGaD)aFns?%@M6 z3AjFSGc+aiFc9o9kFM5OL5efd%~Nx#yKZ9Lach$p>qd1mXy?0o#7#B8vZW6Wk7?EK zjY;v4aOpq*-hhTQ(s}`o4uLl=#i}^SD{_Q&q}>Tn<2PX$EDfwbCZ$8TUA8B902Zgk znawHta0wkh;fVA}yZ$C;cKpuI+WEZ<#)y}}3mgC+yqpvZ(h<xRi52M6TMyhn+<@j?Gqm;%~!v_r(_DCA0<2q?L zS3i=T%*MQXtA>F|HNv~sc}B!5>m$@)>V60(0y{sWK(wh*$T1Rah+8JGpCV4QKpVa!c$3vlC&aF`)8Q4%NV0J z9Oh#KRH7c85|0*|EZ-QH64^%zN+1qxLpH__Nco^7@+T5{v{7jv(HND;a0GsOMB0Z7 zGFp&{83-4H>X6BdM4E4jLH$1!%#e9ZiPc+=j>)hFn8biW>54#LaLI7!0302c;c*$S zN`LM|zqI$K_&=c-PWm$&$JpUUhGkbCU=SHUDE(=L(%ea7M+1fs8qwD~AjN}{%)WIJ zpL`(%vF|N!Uc^KX`-1CT z5*;q$ZUahYXzja+>G&W);KTI6Pbfe`i0S3Q>dPW0)76tAbLr}w$Z1?HTAo%cZ)$mG>t5qUXHWG|(vFEzlvI8c32($ zk*<=ut56Z};8?nP497^3Y0ssUVC$P{o8Z6^LLb+(AZ+dYq0|ncKvNJ@8l2np z{t-T4a{8c^!P!;1d3C>}Jr#42%TEJ%;m|Wj=mz%--LapHa4maN`q5w5eBbza1Jc*M z|Nn6>8dXbRqlQQ_)nt30`p3PS*QlbdX4F?c^pPAabhDc8F>St^KRF1R@1_<1CH+M1 zH`aXr;(pv4___E?^mDe)$;z32&h&z*{15xNjhBr4Gm=SU4JnrSU-&=G(GFYKgWu@d zd^$SQXV-d!d^9nE88Z23zRwT(?Hl6-KZmP-VyJ&kgp}%&|2dBN%Id*>`;K@1prPO1 z`cMCAKBv*U#NX@utA@|vcK!`+*+G0UDjm;=`&Ir$&Xm9W<}S*cDIoqR&DZa prev_grad - prev_grad = grad - - def test_timestep_defaults(self): - embedding_dim = 16 - timesteps = paddle.arange(start=10) - t1 = get_timestep_embedding(timesteps, embedding_dim) - t2 = get_timestep_embedding( - timesteps, embedding_dim, flip_sin_to_cos=False, downscale_freq_shift=1, max_period=10000 - ) - assert paddle.allclose(t1.cpu(), t2.cpu(), atol=0.01) - - def test_timestep_flip_sin_cos(self): - embedding_dim = 16 - timesteps = paddle.arange(start=10) - t1 = get_timestep_embedding(timesteps, embedding_dim, flip_sin_to_cos=True) - t1 = paddle.concat(x=[t1[:, embedding_dim // 2 :], t1[:, : embedding_dim // 2]], axis=-1) - t2 = get_timestep_embedding(timesteps, embedding_dim, flip_sin_to_cos=False) - assert paddle.allclose(t1.cpu(), t2.cpu(), atol=0.01) - - def test_timestep_downscale_freq_shift(self): - embedding_dim = 16 - timesteps = paddle.arange(start=10) - t1 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=0) - t2 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=1) - cosine_half = (t1 - t2)[:, embedding_dim // 2 :] - assert (np.abs((cosine_half <= 0).numpy()) - 1).sum() < 1e-05 - - def test_sinoid_embeddings_hardcoded(self): - embedding_dim = 64 - timesteps = paddle.arange(start=128) - t1 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=1, flip_sin_to_cos=False) - t2 = get_timestep_embedding(timesteps, embedding_dim, 
downscale_freq_shift=0, flip_sin_to_cos=True) - t3 = get_timestep_embedding(timesteps, embedding_dim, scale=1000) - assert paddle.allclose( - t1[23:26, 47:50].flatten().cpu(), - paddle.to_tensor([0.9646, 0.9804, 0.9892, 0.9615, 0.9787, 0.9882, 0.9582, 0.9769, 0.9872]), - atol=0.01, - ) - assert paddle.allclose( - t2[23:26, 47:50].flatten().cpu(), - paddle.to_tensor([0.3019, 0.228, 0.1716, 0.3146, 0.2377, 0.179, 0.3272, 0.2474, 0.1864]), - atol=0.01, - ) - assert paddle.allclose( - t3[23:26, 47:50].flatten().cpu(), - paddle.to_tensor([-0.9801, -0.9464, -0.9349, -0.3952, 0.8887, -0.9709, 0.5299, -0.2853, -0.9927]), - atol=0.01, - ) - - -class Upsample2DBlockTests(unittest.TestCase): - def test_upsample_default(self): - paddle.seed(0) - sample = paddle.randn(shape=[1, 32, 32, 32]) - upsample = Upsample2D(channels=32, use_conv=False) - with paddle.no_grad(): - upsampled = upsample(sample) - assert tuple(upsampled.shape) == (1, 32, 64, 64) - output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor( - [ - -1.50215650, - -0.12905766, - -0.12905766, - -1.97015178, - 0.78776687, - 0.78776687, - -1.97015178, - 0.78776687, - 0.78776687, - ] - ) - assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) - - def test_upsample_with_conv(self): - paddle.seed(0) - sample = paddle.randn(shape=[1, 32, 32, 32]) - upsample = Upsample2D(channels=32, use_conv=True) - with paddle.no_grad(): - upsampled = upsample(sample) - assert tuple(upsampled.shape) == (1, 32, 64, 64) - output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor( - [ - 0.4583871364593506, - -0.8221798539161682, - -0.8228907585144043, - 0.3325321078300476, - -0.24422502517700195, - 1.344732642173767, - 0.5239212512969971, - -0.4814918637275696, - 0.17928099632263184, - ] - ) - assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) - - def test_upsample_with_conv_out_dim(self): - paddle.seed(0) - sample = paddle.randn(shape=[1, 32, 32, 32]) - upsample = Upsample2D(channels=32, use_conv=True, out_channels=64) - with paddle.no_grad(): - upsampled = upsample(sample) - assert tuple(upsampled.shape) == (1, 64, 64, 64) - output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor( - [ - 0.9049283266067505, - -1.6125869750976562, - -1.0837469100952148, - 0.24520659446716309, - -0.6669139266014099, - 0.5660533905029297, - 1.1056761741638184, - 2.1717309951782227, - 0.7197026610374451, - ] - ) - assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) - - def test_upsample_with_transpose(self): - paddle.seed(0) - sample = paddle.randn(shape=[1, 32, 32, 32]) - upsample = Upsample2D(channels=32, use_conv=False, use_conv_transpose=True) - with paddle.no_grad(): - upsampled = upsample(sample) - assert tuple(upsampled.shape) == (1, 32, 64, 64) - output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor( - [ - -0.05951342731714249, - 0.26951998472213745, - 0.2600363492965698, - 1.12237548828125, - -0.07744798064231873, - 0.006375734228640795, - 0.6678807735443115, - 0.44324278831481934, - -0.10978640615940094, - ] - ) - assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) - - -class Downsample2DBlockTests(unittest.TestCase): - def test_downsample_default(self): - paddle.seed(0) - sample = paddle.randn(shape=[1, 32, 64, 64]) - downsample = Downsample2D(channels=32, use_conv=False) - with paddle.no_grad(): - downsampled = downsample(sample) - assert tuple(downsampled.shape) == (1, 32, 32, 32) - output_slice = 
downsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor( - [ - -0.24012964963912964, - -0.034197285771369934, - -1.0328047275543213, - 0.7861506938934326, - -0.2086063176393509, - -0.3999312222003937, - 0.25081655383110046, - -0.23891538381576538, - -1.4398303031921387, - ] - ) - max_diff = (output_slice.flatten() - expected_slice).abs().sum().item() - assert max_diff <= 0.001 - - def test_downsample_with_conv(self): - paddle.seed(0) - sample = paddle.randn(shape=[1, 32, 64, 64]) - downsample = Downsample2D(channels=32, use_conv=True) - with paddle.no_grad(): - downsampled = downsample(sample) - assert tuple(downsampled.shape) == (1, 32, 32, 32) - output_slice = downsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor( - [ - -0.009430217556655407, - 0.8657761216163635, - 1.7985490560531616, - -0.61894291639328, - -2.5752196311950684, - 1.2352519035339355, - 0.6046919822692871, - -1.6499173641204834, - -1.5272349119186401, - ] - ) - assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) - - def test_downsample_with_conv_pad1(self): - paddle.seed(0) - sample = paddle.randn(shape=[1, 32, 64, 64]) - downsample = Downsample2D(channels=32, use_conv=True, padding=1) - with paddle.no_grad(): - downsampled = downsample(sample) - assert tuple(downsampled.shape) == (1, 32, 32, 32) - output_slice = downsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor( - [ - -0.009430217556655407, - 0.8657761216163635, - 1.7985490560531616, - -0.61894291639328, - -2.5752196311950684, - 1.2352519035339355, - 0.6046919822692871, - -1.6499173641204834, - -1.5272349119186401, - ] - ) - assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) - - def test_downsample_with_conv_out_dim(self): - paddle.seed(0) - sample = paddle.randn(shape=[1, 32, 64, 64]) - downsample = Downsample2D(channels=32, use_conv=True, out_channels=16) - with paddle.no_grad(): - downsampled = downsample(sample) - assert tuple(downsampled.shape) == (1, 16, 32, 32) - output_slice = downsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor( - [ - 0.10819266736507416, - 0.43043053150177, - -0.7322822213172913, - -1.923148512840271, - 1.0195047855377197, - 0.48796477913856506, - 1.6765365600585938, - -4.072991847991943, - 0.8763526082038879, - ] - ) - assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) - - -class ResnetBlock2DTests(unittest.TestCase): - def test_resnet_default(self): - paddle.seed(0) - sample = paddle.randn(shape=[1, 32, 64, 64]) - temb = paddle.randn(shape=[1, 128]) - resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128) - with paddle.no_grad(): - output_tensor = resnet_block(sample, temb) - assert tuple(output_tensor.shape) == (1, 32, 64, 64) - output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor( - [ - 1.9816107749938965, - 1.4443503618240356, - -1.0354782342910767, - 0.23985600471496582, - -1.0868161916732788, - -1.5830397605895996, - -0.041037797927856445, - -1.2574901580810547, - -0.5504958629608154, - ] - ) - assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) - - def test_restnet_with_use_in_shortcut(self): - paddle.seed(0) - sample = paddle.randn(shape=[1, 32, 64, 64]) - temb = paddle.randn(shape=[1, 128]) - resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, use_in_shortcut=True) - with paddle.no_grad(): - output_tensor = resnet_block(sample, temb) - assert tuple(output_tensor.shape) == (1, 32, 64, 64) - output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice 
= paddle.to_tensor( - [ - -0.9861348867416382, - -1.097771406173706, - 0.268703430891037, - 0.40997087955474854, - -4.26219367980957, - 1.758486270904541, - -0.8979732990264893, - 0.30774950981140137, - 3.2780206203460693, - ] - ) - assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) - - def test_resnet_up(self): - paddle.seed(0) - sample = paddle.randn(shape=[1, 32, 64, 64]) - temb = paddle.randn(shape=[1, 128]) - resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, up=True) - with paddle.no_grad(): - output_tensor = resnet_block(sample, temb) - assert tuple(output_tensor.shape) == (1, 32, 128, 128) - output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor( - [ - 0.2874237298965454, - -2.6432056427001953, - -2.1900298595428467, - -0.48899877071380615, - -1.1637755632400513, - -1.084446907043457, - -1.1333439350128174, - 0.2726985812187195, - -0.014697253704071045, - ] - ) - assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) - - def test_resnet_down(self): - paddle.seed(0) - sample = paddle.randn(shape=[1, 32, 64, 64]) - temb = paddle.randn(shape=[1, 128]) - resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, down=True) - with paddle.no_grad(): - output_tensor = resnet_block(sample, temb) - assert tuple(output_tensor.shape) == (1, 32, 32, 32) - output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor( - [ - 1.54087495803833, - 0.26700693368911743, - -0.540952742099762, - 2.7190208435058594, - -0.09766747057437897, - 0.23407122492790222, - 0.47980907559394836, - 0.6348602771759033, - -0.75424242019653322, - ] - ) - assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) - - def test_restnet_with_kernel_fir(self): - paddle.seed(0) - sample = paddle.randn(shape=[1, 32, 64, 64]) - temb = paddle.randn(shape=[1, 128]) - resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, kernel="fir", down=True) - with paddle.no_grad(): - output_tensor = resnet_block(sample, temb) - assert tuple(output_tensor.shape) == (1, 32, 32, 32) - output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor( - [ - 0.9914248585700989, - 0.4773162007331848, - -0.021942138671875, - 2.482321262359619, - 0.18839354813098907, - 0.1516135334968567, - 0.7221578359603882, - 0.3920581340789795, - -0.24661940336227417, - ] - ) - assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) - - def test_restnet_with_kernel_sde_vp(self): - paddle.seed(0) - sample = paddle.randn(shape=[1, 32, 64, 64]) - temb = paddle.randn(shape=[1, 128]) - resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, kernel="sde_vp", down=True) - with paddle.no_grad(): - output_tensor = resnet_block(sample, temb) - assert tuple(output_tensor.shape) == (1, 32, 32, 32) - output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor( - [ - 1.54087495803833, - 0.26700693368911743, - -0.540952742099762, - 2.7190208435058594, - -0.09766747057437897, - 0.23407122492790222, - 0.47980907559394836, - 0.6348602771759033, - -0.7542424201965332, - ] - ) - assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) - - -class AttentionBlockTests(unittest.TestCase): - def test_attention_block_default(self): - paddle.seed(0) - sample = paddle.randn(shape=[1, 32, 64, 64]) - attentionBlock = AttentionBlock( - channels=32, num_head_channels=1, rescale_output_factor=1.0, eps=1e-06, norm_num_groups=32 - ) - with paddle.no_grad(): - attention_scores = 
attentionBlock(sample) - assert attention_scores.shape == [1, 32, 64, 64] - output_slice = attention_scores[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor( - [ - 1.638939619064331, - -0.15776772797107697, - -1.1130025386810303, - -0.8540273904800415, - -0.5696781873703003, - -2.0493741035461426, - -0.3732607960700989, - -1.740313172340393, - -0.5271167755126953, - ] - ) - assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) - - def test_attention_block_sd(self): - paddle.seed(0) - sample = paddle.randn(shape=[1, 512, 64, 64]) - attentionBlock = AttentionBlock(channels=512, rescale_output_factor=1.0, eps=1e-06, norm_num_groups=32) - with paddle.no_grad(): - attention_scores = attentionBlock(sample) - assert attention_scores.shape == [1, 512, 64, 64] - output_slice = attention_scores[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor( - [ - -0.8007570505142212, - -0.770350992679596, - -3.5278191566467285, - -2.0540268421173096, - -0.7711739540100098, - -0.8278288245201111, - -0.48292720317840576, - 1.6039936542510986, - 0.626724362373352, - ] - ) - assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) - - -class Transformer2DModelTests(unittest.TestCase): - def test_spatial_transformer_default(self): - paddle.seed(0) - sample = paddle.randn(shape=[1, 32, 64, 64]) - spatial_transformer_block = Transformer2DModel( - in_channels=32, num_attention_heads=1, attention_head_dim=32, dropout=0.0, cross_attention_dim=None - ) - with paddle.no_grad(): - attention_scores = spatial_transformer_block(sample).sample - assert attention_scores.shape == [1, 32, 64, 64] - output_slice = attention_scores[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor( - [ - 2.6310853958129883, - 5.990478515625, - 0.5715246200561523, - -2.5269505977630615, - -2.853764057159424, - -5.163403511047363, - 0.2880846858024597, - -5.925153732299805, - 2.316770076751709, - ] - ) - assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) - - def test_spatial_transformer_cross_attention_dim(self): - paddle.seed(0) - sample = paddle.randn(shape=[1, 64, 64, 64]) - spatial_transformer_block = Transformer2DModel( - in_channels=64, num_attention_heads=2, attention_head_dim=32, dropout=0.0, cross_attention_dim=64 - ) - with paddle.no_grad(): - context = paddle.randn(shape=[1, 4, 64]) - attention_scores = spatial_transformer_block(sample, context).sample - assert attention_scores.shape == [1, 64, 64, 64] - output_slice = attention_scores[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor( - [ - -0.08756911754608154, - -3.94197940826416, - -0.25678586959838867, - 2.1481714248657227, - 2.327033042907715, - 0.29948690533638, - 1.3845969438552856, - 0.7825677394866943, - 1.4856826066970825, - ] - ) - assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) - - def test_spatial_transformer_timestep(self): - paddle.seed(0) - num_embeds_ada_norm = 5 - sample = paddle.randn(shape=[1, 64, 64, 64]) - spatial_transformer_block = Transformer2DModel( - in_channels=64, - num_attention_heads=2, - attention_head_dim=32, - dropout=0.0, - cross_attention_dim=64, - num_embeds_ada_norm=num_embeds_ada_norm, - ) - with paddle.no_grad(): - timestep_1 = paddle.to_tensor(1, dtype="int64") - timestep_2 = paddle.to_tensor(2, dtype="int64") - attention_scores_1 = spatial_transformer_block(sample, timestep=timestep_1).sample - attention_scores_2 = spatial_transformer_block(sample, timestep=timestep_2).sample - assert tuple(attention_scores_1.shape) == (1, 64, 64, 64) - assert 
tuple(attention_scores_2.shape) == (1, 64, 64, 64) - output_slice_1 = attention_scores_1[0, -1, -3:, -3:] - output_slice_2 = attention_scores_2[0, -1, -3:, -3:] - expected_slice_1 = paddle.to_tensor( - [ - -0.15322405099868774, - -1.265586018562317, - -5.424124717712402, - -0.7333418130874634, - -0.5904415249824524, - 0.9293081760406494, - 1.1033945083618164, - -5.200987815856934, - -0.7598087787628174, - ] - ) - expected_slice_2 = paddle.to_tensor( - [ - 0.12572699785232544, - -1.0498149394989014, - -5.207070350646973, - -0.41757693886756897, - -0.25374162197113037, - 1.152648687362671, - 1.422953724861145, - -4.933906078338623, - -0.564710259437561, - ] - ) - assert paddle.allclose(output_slice_1.flatten(), expected_slice_1, atol=0.01) - assert paddle.allclose(output_slice_2.flatten(), expected_slice_2, atol=0.01) - - def test_spatial_transformer_dropout(self): - paddle.seed(0) - sample = paddle.randn(shape=[1, 32, 64, 64]) - spatial_transformer_block = Transformer2DModel( - in_channels=32, num_attention_heads=2, attention_head_dim=16, dropout=0.3, cross_attention_dim=None - ).eval() - with paddle.no_grad(): - attention_scores = spatial_transformer_block(sample).sample - assert attention_scores.shape == [1, 32, 64, 64] - output_slice = attention_scores[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor( - [ - 2.535370349884033, - 6.2350993156433105, - 0.8244613409042358, - -2.6684911251068115, - -2.758057117462158, - -5.176937103271484, - 0.3372979760169983, - -5.837750434875488, - 2.3483340740203857, - ] - ) - assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) - - def test_spatial_transformer_discrete(self): - paddle.seed(0) - num_embed = 5 - sample = paddle.randint(0, num_embed, (1, 32)) - spatial_transformer_block = Transformer2DModel( - num_attention_heads=1, attention_head_dim=32, num_vector_embeds=num_embed, sample_size=16 - ).eval() - with paddle.no_grad(): - attention_scores = spatial_transformer_block(sample).sample - assert attention_scores.shape == [1, num_embed - 1, 32] - output_slice = attention_scores[0, -2:, -3:] - expected_slice = paddle.to_tensor( - [ - -0.14130862057209015, - -0.14278407394886017, - -0.498604953289032, - -3.2408740520477295, - -3.852043390274048, - -2.099970579147339, - ] - ) - assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) - - def test_spatial_transformer_default_norm_layers(self): - spatial_transformer_block = Transformer2DModel(num_attention_heads=1, attention_head_dim=32, in_channels=32) - assert spatial_transformer_block.transformer_blocks[0].norm1.__class__ == paddle.nn.LayerNorm - assert spatial_transformer_block.transformer_blocks[0].norm3.__class__ == paddle.nn.LayerNorm - - def test_spatial_transformer_ada_norm_layers(self): - spatial_transformer_block = Transformer2DModel( - num_attention_heads=1, attention_head_dim=32, in_channels=32, num_embeds_ada_norm=5 - ) - assert spatial_transformer_block.transformer_blocks[0].norm1.__class__ == AdaLayerNorm - assert spatial_transformer_block.transformer_blocks[0].norm3.__class__ == paddle.nn.LayerNorm - - def test_spatial_transformer_default_ff_layers(self): - spatial_transformer_block = Transformer2DModel(num_attention_heads=1, attention_head_dim=32, in_channels=32) - assert spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ == GEGLU - assert spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ == paddle.nn.Dropout - assert spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ == paddle.nn.Linear - dim = 
32 - inner_dim = 128 - assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.weight.shape[0] == dim - assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.weight.shape[1] == inner_dim * 2 - assert spatial_transformer_block.transformer_blocks[0].ff.net[2].weight.shape[0] == inner_dim - assert spatial_transformer_block.transformer_blocks[0].ff.net[2].weight.shape[1] == dim - - def test_spatial_transformer_geglu_approx_ff_layers(self): - spatial_transformer_block = Transformer2DModel( - num_attention_heads=1, attention_head_dim=32, in_channels=32, activation_fn="geglu-approximate" - ) - assert spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ == ApproximateGELU - assert spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ == paddle.nn.Dropout - assert spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ == paddle.nn.Linear - dim = 32 - inner_dim = 128 - assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.weight.shape[0] == dim - assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.weight.shape[1] == inner_dim - assert spatial_transformer_block.transformer_blocks[0].ff.net[2].weight.shape[0] == inner_dim - assert spatial_transformer_block.transformer_blocks[0].ff.net[2].weight.shape[1] == dim - - def test_spatial_transformer_attention_bias(self): - spatial_transformer_block = Transformer2DModel( - num_attention_heads=1, attention_head_dim=32, in_channels=32, attention_bias=True - ) - assert spatial_transformer_block.transformer_blocks[0].attn1.to_q.bias is not None - assert spatial_transformer_block.transformer_blocks[0].attn1.to_k.bias is not None - assert spatial_transformer_block.transformer_blocks[0].attn1.to_v.bias is not None diff --git a/ppdiffusers/tests/models/test_lora_layers.py b/ppdiffusers/tests/models/test_lora_layers.py deleted file mode 100644 index b0ba2ad0fad7..000000000000 --- a/ppdiffusers/tests/models/test_lora_layers.py +++ /dev/null @@ -1,222 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
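# Editor's sketch of the GEGLU shape convention asserted by the feed-forward tests above.
# TinyGEGLU is a hypothetical stand-in, not ppdiffusers' actual GEGLU implementation: the
# input projection maps dim -> inner_dim * 2 and half of the result gates the other half.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F


class TinyGEGLU(nn.Layer):
    def __init__(self, dim, inner_dim):
        super().__init__()
        self.proj = nn.Linear(dim, inner_dim * 2)  # weight shape [dim, inner_dim * 2], as the tests check

    def forward(self, x):
        hidden, gate = self.proj(x).chunk(2, axis=-1)
        return hidden * F.gelu(gate)


out = TinyGEGLU(dim=32, inner_dim=128)(paddle.randn([1, 4, 32]))  # -> [1, 4, 128]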
-import os
-import tempfile
-import unittest
-
-import paddle
-import paddle.nn as nn
-
-from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-from ppdiffusers import (
-    AutoencoderKL,
-    DDIMScheduler,
-    StableDiffusionPipeline,
-    UNet2DConditionModel,
-)
-from ppdiffusers.loaders import AttnProcsLayers, LoraLoaderMixin
-from ppdiffusers.models.attention_processor import LoRAAttnProcessor
-from ppdiffusers.utils import TEXT_ENCODER_ATTN_MODULE, floats_tensor
-
-
-def create_unet_lora_layers(unet: nn.Layer):
-    lora_attn_procs = {}
-    for name in unet.attn_processors.keys():
-        cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
-        if name.startswith("mid_block"):
-            hidden_size = unet.config.block_out_channels[-1]
-        elif name.startswith("up_blocks"):
-            block_id = int(name[len("up_blocks.")])
-            hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
-        elif name.startswith("down_blocks"):
-            block_id = int(name[len("down_blocks.")])
-            hidden_size = unet.config.block_out_channels[block_id]
-        lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
-    unet_lora_layers = AttnProcsLayers(lora_attn_procs)
-    return lora_attn_procs, unet_lora_layers
-
-
-def create_text_encoder_lora_layers(text_encoder: nn.Layer):
-    text_lora_attn_procs = {}
-    for name, module in text_encoder.named_sublayers(include_self=True):
-        if name.endswith(TEXT_ENCODER_ATTN_MODULE):
-            text_lora_attn_procs[name] = LoRAAttnProcessor(
-                hidden_size=module.out_proj.weight.shape[1], cross_attention_dim=None
-            )
-
-    text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs)
-    return text_encoder_lora_layers
-
-
-class LoraLoaderMixinTests(unittest.TestCase):
-    def get_dummy_components(self):
-        paddle.seed(0)
-        unet = UNet2DConditionModel(
-            block_out_channels=(32, 64),
-            layers_per_block=2,
-            sample_size=32,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=32,
-        )
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            clip_sample=False,
-            set_alpha_to_one=False,
-            steps_offset=1,
-        )
-        paddle.seed(0)
-        vae = AutoencoderKL(
-            block_out_channels=[32, 64],
-            in_channels=3,
-            out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            latent_channels=4,
-        )
-        text_encoder_config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=32,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
-            pad_token_id=1,
-            vocab_size=1000,
-        )
-        text_encoder = CLIPTextModel(text_encoder_config)
-        text_encoder.eval()
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        unet_lora_attn_procs, unet_lora_layers = create_unet_lora_layers(unet)
-        text_encoder_lora_layers = create_text_encoder_lora_layers(text_encoder)
-
-        pipeline_components = {
-            "unet": unet,
-            "scheduler": scheduler,
-            "vae": vae,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-            "safety_checker": None,
-            "feature_extractor": None,
-        }
-        lora_components = {
-            "unet_lora_layers": unet_lora_layers,
-            "text_encoder_lora_layers": text_encoder_lora_layers,
-            "unet_lora_attn_procs": unet_lora_attn_procs,
-        }
-        return pipeline_components, lora_components
-
-    def get_dummy_inputs(self):
-        batch_size = 1
-        sequence_length = 10
-        num_channels = 4
-        sizes = (32, 32)
-
-        generator = paddle.Generator().manual_seed(0)
-        noise = floats_tensor((batch_size, num_channels) + sizes)
-        input_ids = paddle.randint(1, sequence_length, size=(batch_size, sequence_length), generator=generator)
-
-        pipeline_inputs = {
-            "prompt": "A painting of a squirrel eating a burger",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 6.0,
-            "output_type": "numpy",
-        }
-
-        return noise, input_ids, pipeline_inputs
-
-    def test_lora_save_load(self):
-        pipeline_components, lora_components = self.get_dummy_components()
-        sd_pipe = StableDiffusionPipeline(**pipeline_components)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        noise, input_ids, pipeline_inputs = self.get_dummy_inputs()
-
-        original_images = sd_pipe(**pipeline_inputs).images
-        orig_image_slice = original_images[0, -3:, -3:, -1]
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            LoraLoaderMixin.save_lora_weights(
-                save_directory=tmpdirname,
-                unet_lora_layers=lora_components["unet_lora_layers"],
-                text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
-                to_diffusers=False,
-            )
-            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
-            sd_pipe.load_lora_weights(tmpdirname, from_diffusers=False)
-
-        lora_images = sd_pipe(**pipeline_inputs).images
-        lora_image_slice = lora_images[0, -3:, -3:, -1]
-
-        # Outputs shouldn't match.
-        self.assertFalse(paddle.allclose(paddle.to_tensor(orig_image_slice), paddle.to_tensor(lora_image_slice)))
-
-    def test_lora_save_load_safetensors(self):
-        pipeline_components, lora_components = self.get_dummy_components()
-        sd_pipe = StableDiffusionPipeline(**pipeline_components)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        noise, input_ids, pipeline_inputs = self.get_dummy_inputs()
-
-        original_images = sd_pipe(**pipeline_inputs).images
-        orig_image_slice = original_images[0, -3:, -3:, -1]
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            LoraLoaderMixin.save_lora_weights(
-                save_directory=tmpdirname,
-                unet_lora_layers=lora_components["unet_lora_layers"],
-                text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
-                safe_serialization=True,
-                to_diffusers=True,
-            )
-            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")))
-            sd_pipe.load_lora_weights(tmpdirname, from_diffusers=True)
-
-        lora_images = sd_pipe(**pipeline_inputs).images
-        lora_image_slice = lora_images[0, -3:, -3:, -1]
-
-        # Outputs shouldn't match.
-        self.assertFalse(paddle.allclose(paddle.to_tensor(orig_image_slice), paddle.to_tensor(lora_image_slice)))
-
-    def test_lora_save_load_legacy(self):
-        pipeline_components, lora_components = self.get_dummy_components()
-        unet_lora_attn_procs = lora_components["unet_lora_attn_procs"]
-        sd_pipe = StableDiffusionPipeline(**pipeline_components)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        noise, input_ids, pipeline_inputs = self.get_dummy_inputs()
-
-        original_images = sd_pipe(**pipeline_inputs).images
-        orig_image_slice = original_images[0, -3:, -3:, -1]
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            unet = sd_pipe.unet
-            unet.set_attn_processor(unet_lora_attn_procs)
-            unet.save_attn_procs(tmpdirname, to_diffusers=False)
-            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
-            sd_pipe.load_lora_weights(tmpdirname, from_diffusers=False)
-
-        lora_images = sd_pipe(**pipeline_inputs).images
-        lora_image_slice = lora_images[0, -3:, -3:, -1]
-
-        # Outputs shouldn't match.
-        self.assertFalse(paddle.allclose(paddle.to_tensor(orig_image_slice), paddle.to_tensor(lora_image_slice)))
diff --git a/ppdiffusers/tests/models/test_modeling_common.py b/ppdiffusers/tests/models/test_modeling_common.py
deleted file mode 100644
index 3e7290d25c16..000000000000
--- a/ppdiffusers/tests/models/test_modeling_common.py
+++ /dev/null
@@ -1,381 +0,0 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-import tempfile
-import unittest
-import unittest.mock as mock
-from typing import Dict, List, Tuple
-
-import numpy as np
-import paddle
-import requests_mock
-from requests.exceptions import HTTPError
-
-from ppdiffusers.models import UNet2DConditionModel
-from ppdiffusers.training_utils import EMAModel
-from ppdiffusers.utils import logging
-from ppdiffusers.utils.testing_utils import CaptureLogger
-
-
-class ModelUtilsTest(unittest.TestCase):
-    def tearDown(self):
-        super().tearDown()
-
-        import ppdiffusers
-
-        ppdiffusers.utils.import_utils._safetensors_available = True
-
-    def test_cached_files_are_used_when_no_internet(self):
-        response_mock = mock.Mock()
-        response_mock.status_code = 500
-        response_mock.headers = {}
-        response_mock.raise_for_status.side_effect = HTTPError
-        response_mock.json.return_value = {}
-        orig_model = UNet2DConditionModel.from_pretrained(
-            "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet"
-        )
-        with mock.patch("requests.request", return_value=response_mock):
-            model = UNet2DConditionModel.from_pretrained(
-                "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet", local_files_only=True
-            )
-            for p1, p2 in zip(orig_model.parameters(), model.parameters()):
-                if (p1 != p2).cast("int64").sum() > 0:
-                    assert False, "Parameters not the same!"
- - def test_one_request_upon_cached(self): - import ppdiffusers - - ppdiffusers.utils.import_utils._safetensors_available = False - - with tempfile.TemporaryDirectory() as tmpdirname: - with requests_mock.mock(real_http=True) as m: - UNet2DConditionModel.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="unet", - cache_dir=tmpdirname, - from_hf_hub=True, - from_diffusers=True, - ) - - download_requests = [r.method for r in m.request_history] - assert download_requests.count("HEAD") == 2, "2 HEAD requests one for config, one for model" - assert download_requests.count("GET") == 2, "2 GET requests one for config, one for model" - - with requests_mock.mock(real_http=True) as m: - UNet2DConditionModel.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="unet", - cache_dir=tmpdirname, - from_hf_hub=True, - from_diffusers=True, - ) - - cache_requests = [r.method for r in m.request_history] - # TODO check this - assert ( - "HEAD" == cache_requests[0] and len(cache_requests) == 2 - ), "We should call only `model_info` to check for _commit hash and `send_telemetry`" - - ppdiffusers.utils.import_utils._safetensors_available = True - - def test_weight_overwrite(self): - with tempfile.TemporaryDirectory() as tmpdirname, self.assertRaises(RuntimeError) as error_context: - UNet2DConditionModel.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="unet", - cache_dir=tmpdirname, - in_channels=9, - from_hf_hub=True, - from_diffusers=True, - ) - - # make sure that error message states what keys are missing - assert "size mismatch" in str(error_context.exception) - - with tempfile.TemporaryDirectory() as tmpdirname: - model = UNet2DConditionModel.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="unet", - cache_dir=tmpdirname, - in_channels=9, - low_cpu_mem_usage=False, - ignore_mismatched_sizes=True, - from_hf_hub=True, - from_diffusers=True, - ) - - assert model.config.in_channels == 9 - - -class ModelTesterMixin: - def test_from_save_pretrained(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - if hasattr(model, "set_default_attn_processor"): - model.set_default_attn_processor() - model.eval() - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - new_model = self.model_class.from_pretrained(tmpdirname) - if hasattr(new_model, "set_default_attn_processor"): - new_model.set_default_attn_processor() - with paddle.no_grad(): - image = model(**inputs_dict) - if isinstance(image, dict): - image = image.sample - new_image = new_model(**inputs_dict) - if isinstance(new_image, dict): - new_image = new_image.sample - max_diff = (image - new_image).abs().sum().item() - self.assertLessEqual(max_diff, 5e-05, "Models give different forward passes") - - def test_getattr_is_correct(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - - # save some things to test - model.dummy_attribute = 5 - model.register_to_config(test_attribute=5) - - logger = logging.get_logger("diffusers.models.modeling_utils") - # 30 for warning - logger.setLevel(30) - with CaptureLogger(logger) as cap_logger: - assert hasattr(model, "dummy_attribute") - assert getattr(model, "dummy_attribute") == 5 - assert model.dummy_attribute == 5 - - # no warning should be thrown - assert cap_logger.out == "" - - logger = 
logging.get_logger("diffusers.models.modeling_utils") - # 30 for warning - logger.setLevel(30) - with CaptureLogger(logger) as cap_logger: - assert hasattr(model, "save_pretrained") - fn = model.save_pretrained - fn_1 = getattr(model, "save_pretrained") - - assert fn == fn_1 - # no warning should be thrown - assert cap_logger.out == "" - - # warning should be thrown - with self.assertWarns(FutureWarning): - assert model.test_attribute == 5 - - with self.assertWarns(FutureWarning): - assert getattr(model, "test_attribute") == 5 - - with self.assertRaises(AttributeError) as error: - model.does_not_exist - - assert str(error.exception) == f"'{type(model).__name__}' object has no attribute 'does_not_exist'" - - def test_from_save_pretrained_variant(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - if hasattr(model, "set_default_attn_processor"): - model.set_default_attn_processor() - model.eval() - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname, variant="fp16") - new_model = self.model_class.from_pretrained(tmpdirname, variant="fp16") - if hasattr(new_model, "set_default_attn_processor"): - new_model.set_default_attn_processor() - # non-variant cannot be loaded - with self.assertRaises(OSError) as error_context: - self.model_class.from_pretrained(tmpdirname) - - # make sure that error message states what keys are missing - # support diffusion_pytorch_model.bin and model_state.pdparams - assert "Error no file named model_state.pdparams found in directory" in str( - error_context.exception - ) or "Error no file named diffusion_pytorch_model.bin found in directory" in str(error_context.exception) - with paddle.no_grad(): - - image = model(**inputs_dict) - if isinstance(image, dict): - image = image.sample - new_image = new_model(**inputs_dict) - if isinstance(new_image, dict): - new_image = new_image.sample - max_diff = (image - new_image).abs().sum().item() - self.assertLessEqual(max_diff, 5e-05, "Models give different forward passes") - - def test_from_save_pretrained_dtype(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.eval() - for dtype in [paddle.float32, paddle.float16, paddle.bfloat16]: - - with tempfile.TemporaryDirectory() as tmpdirname: - model.to(dtype=dtype) - model.save_pretrained(tmpdirname) - new_model = self.model_class.from_pretrained(tmpdirname, paddle_dtype=dtype) - assert new_model.dtype == dtype - new_model = self.model_class.from_pretrained(tmpdirname, paddle_dtype=dtype) - assert new_model.dtype == dtype - - def test_determinism(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.eval() - with paddle.no_grad(): - first = model(**inputs_dict) - if isinstance(first, dict): - first = first.sample - second = model(**inputs_dict) - if isinstance(second, dict): - second = second.sample - out_1 = first.cpu().numpy() - out_2 = second.cpu().numpy() - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-05) - - def test_output(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.eval() - with paddle.no_grad(): - output = model(**inputs_dict) - if isinstance(output, dict): - output = output.sample - self.assertIsNotNone(output) - expected_shape = 
inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_forward_with_norm_groups(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - init_dict["norm_num_groups"] = 16 - init_dict["block_out_channels"] = 16, 32 - model = self.model_class(**init_dict) - model.eval() - with paddle.no_grad(): - output = model(**inputs_dict) - if isinstance(output, dict): - output = output.sample - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_forward_signature(self): - init_dict, _ = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - signature = inspect.signature(model.forward) - arg_names = [*signature.parameters.keys()] - expected_arg_names = ["sample", "timestep"] - self.assertListEqual(arg_names[:2], expected_arg_names) - - def test_model_from_pretrained(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.eval() - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - new_model = self.model_class.from_pretrained(tmpdirname) - new_model.eval() - for param_name in model.state_dict().keys(): - param_1 = model.state_dict()[param_name] - param_2 = new_model.state_dict()[param_name] - self.assertEqual(param_1.shape, param_2.shape) - with paddle.no_grad(): - output_1 = model(**inputs_dict) - if isinstance(output_1, dict): - output_1 = output_1.sample - output_2 = new_model(**inputs_dict) - if isinstance(output_2, dict): - output_2 = output_2.sample - self.assertEqual(output_1.shape, output_2.shape) - - def test_training(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.train() - output = model(**inputs_dict) - if isinstance(output, dict): - output = output.sample - noise = paddle.randn(shape=list((inputs_dict["sample"].shape[0],) + self.output_shape)) - loss = paddle.nn.functional.mse_loss(input=output, label=noise) - loss.backward() - - def test_ema_training(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.train() - ema_model = EMAModel(model.parameters()) - output = model(**inputs_dict) - if isinstance(output, dict): - output = output.sample - noise = paddle.randn(shape=list((inputs_dict["sample"].shape[0],) + self.output_shape)) - loss = paddle.nn.functional.mse_loss(input=output, label=noise) - loss.backward() - ema_model.step(model.parameters()) - - def test_outputs_equivalence(self): - def set_nan_tensor_to_zero(t): - # t[t != t] = 0 - return t - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - paddle.allclose( - set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-05 - ), - msg=f"Tuple and dict output are not equal. Difference: {paddle.max(x=paddle.abs(x=tuple_object - dict_object))}. 
Tuple has `nan`: {paddle.isnan(x=tuple_object).any()} and `inf`: {paddle.isinf(x=tuple_object)}. Dict has `nan`: {paddle.isnan(x=dict_object).any()} and `inf`: {paddle.isinf(x=dict_object)}.", - ) - - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.eval() - with paddle.no_grad(): - outputs_dict = model(**inputs_dict) - outputs_tuple = model(**inputs_dict, return_dict=False) - recursive_check(outputs_tuple, outputs_dict) - - def test_enable_disable_gradient_checkpointing(self): - if not self.model_class._supports_gradient_checkpointing: - return - init_dict, _ = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - self.assertFalse(model.is_gradient_checkpointing) - model.enable_gradient_checkpointing() - self.assertTrue(model.is_gradient_checkpointing) - model.disable_gradient_checkpointing() - self.assertFalse(model.is_gradient_checkpointing) - - def test_deprecated_kwargs(self): - has_kwarg_in_model_class = "kwargs" in inspect.signature(self.model_class.__init__).parameters - has_deprecated_kwarg = len(self.model_class._deprecated_kwargs) > 0 - if has_kwarg_in_model_class and not has_deprecated_kwarg: - raise ValueError( - f"{self.model_class} has `**kwargs` in its __init__ method but has not defined any deprecated kwargs under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if there are no deprecated arguments or add the deprecated argument with `_deprecated_kwargs = []`" - ) - if not has_kwarg_in_model_class and has_deprecated_kwarg: - raise ValueError( - f"{self.model_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated kwargs under the `_deprecated_kwargs` class attribute. Make sure to either add the `**kwargs` argument to {self.model_class}.__init__ if there are deprecated arguments or remove the deprecated argument from `_deprecated_kwargs = []`" - ) diff --git a/ppdiffusers/tests/models/test_models_unet_1d.py b/ppdiffusers/tests/models/test_models_unet_1d.py deleted file mode 100644 index 1595d64bb66c..000000000000 --- a/ppdiffusers/tests/models/test_models_unet_1d.py +++ /dev/null @@ -1,234 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import paddle - -from ppdiffusers import UNet1DModel -from ppdiffusers.utils import floats_tensor, slow - -from .test_modeling_common import ModelTesterMixin - - -class UNet1DModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet1DModel - - @property - def dummy_input(self): - batch_size = 4 - num_features = 14 - seq_len = 16 - noise = floats_tensor((batch_size, num_features, seq_len)) - time_step = paddle.to_tensor([10] * batch_size) - return {"sample": noise, "timestep": time_step} - - @property - def input_shape(self): - return 4, 14, 16 - - @property - def output_shape(self): - return 4, 14, 16 - - def test_ema_training(self): - pass - - def test_training(self): - pass - - def test_determinism(self): - super().test_determinism() - - def test_outputs_equivalence(self): - super().test_outputs_equivalence() - - def test_from_save_pretrained(self): - super().test_from_save_pretrained() - - def test_model_from_pretrained(self): - super().test_model_from_pretrained() - - def test_output(self): - super().test_output() - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": (32, 64, 128, 256), - "in_channels": 14, - "out_channels": 14, - "time_embedding_type": "positional", - "use_timestep_embedding": True, - "flip_sin_to_cos": False, - "freq_shift": 1.0, - "out_block_type": "OutConv1DBlock", - "mid_block_type": "MidResTemporalBlock1D", - "down_block_types": ("DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D"), - "up_block_types": ("UpResnetBlock1D", "UpResnetBlock1D", "UpResnetBlock1D"), - "act_fn": "mish", - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_from_pretrained_hub(self): - model, loading_info = UNet1DModel.from_pretrained( - "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, subfolder="unet" - ) - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - image = model(**self.dummy_input) - assert image is not None, "Make sure output is not None" - - def test_output_pretrained(self): - model = UNet1DModel.from_pretrained("bglick13/hopper-medium-v2-value-function-hor32", subfolder="unet") - paddle.seed(0) - num_features = model.config.in_channels - seq_len = 16 - noise = paddle.randn(shape=(1, seq_len, num_features)).transpose(perm=[0, 2, 1]) - time_step = paddle.full(shape=(num_features,), fill_value=0) - with paddle.no_grad(): - output = model(noise, time_step).sample.permute(0, 2, 1) - output_slice = output[0, -3:, -3:].flatten() - expected_output_slice = paddle.to_tensor( - [ - -0.2857576608657837, - -0.9908187389373779, - 0.2976357340812683, - -0.8677187561988831, - -0.21778395771980286, - 0.08095654845237732, - -0.5871752500534058, - 0.3299727439880371, - -0.17421625554561615, - ] - ) - self.assertTrue(paddle.allclose(output_slice, expected_output_slice, rtol=0.001)) - - def test_forward_with_norm_groups(self): - pass - - # TODO, check this why not pass - @slow - def test_unet_1d_maestro(self): - model_id = "harmonai/maestro-150k" - model = UNet1DModel.from_pretrained(model_id, subfolder="unet") - sample_size = 65536 - noise = paddle.sin( - x=paddle.arange(start=sample_size, dtype=paddle.float32)[None, None, :].tile(repeat_times=[1, 2, 1]) - ) - timestep = paddle.to_tensor([1.0]) # must cast float32 - with paddle.no_grad(): - output = model(noise, timestep).sample - output_sum = output.abs().sum() - output_max = output.abs().max() - assert (output_sum - 224.0896).abs() < 0.04 - assert 
(output_max - 0.0607).abs() < 0.0004 - - -class UNetRLModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet1DModel - - @property - def dummy_input(self): - batch_size = 4 - num_features = 14 - seq_len = 16 - noise = floats_tensor((batch_size, num_features, seq_len)) - time_step = paddle.to_tensor([10] * batch_size) - return {"sample": noise, "timestep": time_step} - - @property - def input_shape(self): - return 4, 14, 16 - - @property - def output_shape(self): - return 4, 14, 1 - - def test_determinism(self): - super().test_determinism() - - def test_outputs_equivalence(self): - super().test_outputs_equivalence() - - def test_from_save_pretrained(self): - super().test_from_save_pretrained() - - def test_model_from_pretrained(self): - super().test_model_from_pretrained() - - def test_output(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.eval() - with paddle.no_grad(): - output = model(**inputs_dict) - if isinstance(output, dict): - output = output.sample - self.assertIsNotNone(output) - expected_shape = [inputs_dict["sample"].shape[0], 1] - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_ema_training(self): - pass - - def test_training(self): - pass - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "in_channels": 14, - "out_channels": 14, - "down_block_types": ["DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D"], - "up_block_types": [], - "out_block_type": "ValueFunction", - "mid_block_type": "ValueFunctionMidBlock1D", - "block_out_channels": [32, 64, 128, 256], - "layers_per_block": 1, - "downsample_each_block": True, - "use_timestep_embedding": True, - "freq_shift": 1.0, - "flip_sin_to_cos": False, - "time_embedding_type": "positional", - "act_fn": "mish", - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_from_pretrained_hub(self): - value_function, vf_loading_info = UNet1DModel.from_pretrained( - "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, subfolder="value_function" - ) - self.assertIsNotNone(value_function) - self.assertEqual(len(vf_loading_info["missing_keys"]), 0) - image = value_function(**self.dummy_input) - assert image is not None, "Make sure output is not None" - - def test_output_pretrained(self): - value_function, vf_loading_info = UNet1DModel.from_pretrained( - "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, subfolder="value_function" - ) - paddle.seed(0) - num_features = value_function.config.in_channels - seq_len = 14 - noise = paddle.randn(shape=(1, seq_len, num_features)).transpose(perm=[0, 2, 1]) - time_step = paddle.full(shape=(num_features,), fill_value=0) - with paddle.no_grad(): - output = value_function(noise, time_step).sample - expected_output_slice = paddle.to_tensor([291.51135254] * seq_len) - self.assertTrue(paddle.allclose(output.squeeze(-1), expected_output_slice, rtol=0.001)) - - def test_forward_with_norm_groups(self): - pass diff --git a/ppdiffusers/tests/models/test_models_unet_2d.py b/ppdiffusers/tests/models/test_models_unet_2d.py deleted file mode 100644 index 4be528cccdc7..000000000000 --- a/ppdiffusers/tests/models/test_models_unet_2d.py +++ /dev/null @@ -1,244 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import math -import unittest - -import paddle - -from ppdiffusers import UNet2DModel -from ppdiffusers.utils import floats_tensor, logging, paddle_all_close, slow - -from .test_modeling_common import ModelTesterMixin - -logger = logging.get_logger(__name__) - - -class Unet2DModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet2DModel - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 3 - sizes = 32, 32 - noise = floats_tensor((batch_size, num_channels) + sizes) - time_step = paddle.to_tensor([10]) - return {"sample": noise, "timestep": time_step} - - @property - def input_shape(self): - return 3, 32, 32 - - @property - def output_shape(self): - return 3, 32, 32 - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": (32, 64), - "down_block_types": ("DownBlock2D", "AttnDownBlock2D"), - "up_block_types": ("AttnUpBlock2D", "UpBlock2D"), - "attention_head_dim": None, - "out_channels": 3, - "in_channels": 3, - "layers_per_block": 2, - "sample_size": 32, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - -class UNetLDMModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet2DModel - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 4 - sizes = 32, 32 - noise = floats_tensor((batch_size, num_channels) + sizes) - time_step = paddle.to_tensor([10]) - return {"sample": noise, "timestep": time_step} - - @property - def input_shape(self): - return 4, 32, 32 - - @property - def output_shape(self): - return 4, 32, 32 - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "sample_size": 32, - "in_channels": 4, - "out_channels": 4, - "layers_per_block": 2, - "block_out_channels": (32, 64), - "attention_head_dim": 32, - "down_block_types": ("DownBlock2D", "DownBlock2D"), - "up_block_types": ("UpBlock2D", "UpBlock2D"), - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_from_pretrained_hub(self): - model, loading_info = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True) - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - image = model(**self.dummy_input).sample - assert image is not None, "Make sure output is not None" - - def test_from_pretrained_accelerate(self): - model, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True) - image = model(**self.dummy_input).sample - assert image is not None, "Make sure output is not None" - - def test_from_pretrained_accelerate_wont_change_results(self): - model_accelerate, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True) - model_accelerate - model_accelerate.eval() - noise = paddle.randn( - shape=[ - 1, - model_accelerate.config.in_channels, - model_accelerate.config.sample_size, - model_accelerate.config.sample_size, - ], - generator=paddle.Generator().manual_seed(0), - ) - time_step = 
paddle.to_tensor([10] * noise.shape[0]) - arr_accelerate = model_accelerate(noise, time_step)["sample"] - del model_accelerate - paddle.device.cuda.empty_cache() - gc.collect() - model_normal_load, _ = UNet2DModel.from_pretrained( - "fusing/unet-ldm-dummy-update", - output_loading_info=True, - ) - model_normal_load.eval() - arr_normal_load = model_normal_load(noise, time_step)["sample"] - assert paddle_all_close(arr_accelerate, arr_normal_load, rtol=0.001) - - def test_output_pretrained(self): - model = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update") - model.eval() - noise = paddle.randn( - shape=[1, model.config.in_channels, model.config.sample_size, model.config.sample_size], - generator=paddle.Generator().manual_seed(0), - ) - time_step = paddle.to_tensor([10] * noise.shape[0]) - with paddle.no_grad(): - output = model(noise, time_step).sample - output_slice = output[0, -1, -3:, -3:].flatten().cpu() - expected_output_slice = paddle.to_tensor( - [ - 0.43855608, - -10.29346752, - -9.60953522, - -8.39902020, - -16.29206276, - -13.07511997, - -9.30383205, - -13.69859409, - -10.52999401, - ] - ) - self.assertTrue(paddle_all_close(output_slice, expected_output_slice, rtol=0.001)) - - -class NCSNppModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet2DModel - - @property - def dummy_input(self, sizes=(32, 32)): - batch_size = 4 - num_channels = 3 - noise = floats_tensor((batch_size, num_channels) + sizes) - time_step = paddle.to_tensor(batch_size * [10]).cast("int32") - return {"sample": noise, "timestep": time_step} - - @property - def input_shape(self): - return 3, 32, 32 - - @property - def output_shape(self): - return 3, 32, 32 - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": [32, 64, 64, 64], - "in_channels": 3, - "layers_per_block": 1, - "out_channels": 3, - "time_embedding_type": "fourier", - "norm_eps": 1e-06, - "mid_block_scale_factor": math.sqrt(2.0), - "norm_num_groups": None, - "down_block_types": ["SkipDownBlock2D", "AttnSkipDownBlock2D", "SkipDownBlock2D", "SkipDownBlock2D"], - "up_block_types": ["SkipUpBlock2D", "SkipUpBlock2D", "AttnSkipUpBlock2D", "SkipUpBlock2D"], - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @slow - def test_from_pretrained_hub(self): - model, loading_info = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256", output_loading_info=True) - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - inputs = self.dummy_input - noise = floats_tensor((4, 3) + (256, 256)) - inputs["sample"] = noise - image = model(**inputs) - assert image is not None, "Make sure output is not None" - - @slow - def test_output_pretrained_ve_mid(self): - model = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256") - paddle.seed(0) - batch_size = 4 - num_channels = 3 - sizes = 256, 256 - noise = paddle.ones(shape=(batch_size, num_channels, *sizes)) - time_step = paddle.to_tensor(batch_size * [0.0001]) - with paddle.no_grad(): - output = model(noise, time_step).sample - output_slice = output[0, -3:, -3:, -1].flatten().cpu() - expected_output_slice = paddle.to_tensor( - [-4836.2231, -6487.1387, -3816.7969, -7964.9253, -10966.2842, -20043.6016, 8137.0571, 2340.3499, 544.6114] - ) - self.assertTrue(paddle_all_close(output_slice, expected_output_slice, rtol=0.01)) - - def test_output_pretrained_ve_large(self): - model = UNet2DModel.from_pretrained("fusing/ncsnpp-ffhq-ve-dummy-update") - paddle.seed(0) - batch_size = 4 - num_channels = 3 - sizes = 32, 32 
- noise = paddle.ones(shape=(batch_size, num_channels, *sizes)) - time_step = paddle.to_tensor(batch_size * [0.0001]) - with paddle.no_grad(): - output = model(noise, time_step).sample - output_slice = output[0, -3:, -3:, -1].flatten().cpu() - expected_output_slice = paddle.to_tensor( - [-0.0325, -0.09, -0.0869, -0.0332, -0.0725, -0.027, -0.0101, 0.0227, 0.0256] - ) - self.assertTrue(paddle_all_close(output_slice, expected_output_slice, rtol=0.01)) - - def test_forward_with_norm_groups(self): - pass diff --git a/ppdiffusers/tests/models/test_models_unet_2d_condition.py b/ppdiffusers/tests/models/test_models_unet_2d_condition.py deleted file mode 100644 index 12b53290049f..000000000000 --- a/ppdiffusers/tests/models/test_models_unet_2d_condition.py +++ /dev/null @@ -1,797 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import os -import tempfile -import unittest - -import paddle -import paddle.nn as nn -from parameterized import parameterized - -from ppdiffusers import UNet2DConditionModel -from ppdiffusers.models.attention_processor import ( - CustomDiffusionAttnProcessor, - LoRAAttnProcessor, -) -from ppdiffusers.utils import ( - floats_tensor, - load_ppnlp_numpy, - logging, - paddle_all_close, - require_paddle_gpu, - slow, -) -from ppdiffusers.utils.import_utils import is_ppxformers_available - -from .test_modeling_common import ModelTesterMixin - -logger = logging.get_logger(__name__) - - -def create_lora_layers(model, mock_weights: bool = True): - lora_attn_procs = {} - for name in model.attn_processors.keys(): - cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = model.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = model.config.block_out_channels[block_id] - lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) - if mock_weights: - with paddle.no_grad(): - lora_attn_procs[name].to_q_lora.up.weight.set_value(lora_attn_procs[name].to_q_lora.up.weight + 1) - lora_attn_procs[name].to_k_lora.up.weight.set_value(lora_attn_procs[name].to_k_lora.up.weight + 1) - lora_attn_procs[name].to_v_lora.up.weight.set_value(lora_attn_procs[name].to_v_lora.up.weight + 1) - lora_attn_procs[name].to_out_lora.up.weight.set_value(lora_attn_procs[name].to_out_lora.up.weight + 1) - return lora_attn_procs - - -def create_custom_ppdiffusion_layers(model, mock_weights: bool = True): - train_kv = True - train_q_out = True - custom_diffusion_attn_procs = {} - - st = model.state_dict() - for name, _ in model.attn_processors.items(): - cross_attention_dim = None if 
name.endswith("attn1.processor") else model.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = model.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = model.config.block_out_channels[block_id] - layer_name = name.split(".processor")[0] - weights = { - "to_k_custom_diffusion.weight": st[layer_name + ".to_k.weight"], - "to_v_custom_diffusion.weight": st[layer_name + ".to_v.weight"], - } - if train_q_out: - weights["to_q_custom_diffusion.weight"] = st[layer_name + ".to_q.weight"] - weights["to_out_custom_diffusion.0.weight"] = st[layer_name + ".to_out.0.weight"] - weights["to_out_custom_diffusion.0.bias"] = st[layer_name + ".to_out.0.bias"] - if cross_attention_dim is not None: - custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor( - train_kv=train_kv, - train_q_out=train_q_out, - hidden_size=hidden_size, - cross_attention_dim=cross_attention_dim, - ) - custom_diffusion_attn_procs[name].load_dict(weights) - if mock_weights: - # add 1 to weights to mock trained weights - with paddle.no_grad(): - custom_diffusion_attn_procs[name].to_k_custom_diffusion.weight.set_value( - custom_diffusion_attn_procs[name].to_k_custom_diffusion.weight + 1 - ) - custom_diffusion_attn_procs[name].to_v_custom_diffusion.weight.set_value( - custom_diffusion_attn_procs[name].to_v_custom_diffusion.weight + 1 - ) - else: - custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor( - train_kv=False, - train_q_out=False, - hidden_size=hidden_size, - cross_attention_dim=cross_attention_dim, - ) - del st - return custom_diffusion_attn_procs - - -class UNet2DConditionModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet2DConditionModel - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 4 - sizes = 32, 32 - noise = floats_tensor((batch_size, num_channels) + sizes) - time_step = paddle.to_tensor([10]) - encoder_hidden_states = floats_tensor((batch_size, 4, 32)) - return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states} - - @property - def input_shape(self): - return 4, 32, 32 - - @property - def output_shape(self): - return 4, 32, 32 - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": (32, 64), - "down_block_types": ("CrossAttnDownBlock2D", "DownBlock2D"), - "up_block_types": ("UpBlock2D", "CrossAttnUpBlock2D"), - "cross_attention_dim": 32, - "attention_head_dim": 8, - "out_channels": 4, - "in_channels": 4, - "layers_per_block": 2, - "sample_size": 32, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_xformers_enable_works(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.enable_xformers_memory_efficient_attention() - assert ( - model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__ - == "XFormersAttnProcessor" - ), "xformers is not enabled" - - def test_gradient_checkpointing(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - assert not model.is_gradient_checkpointing and model.training - out = model(**inputs_dict).sample - model.clear_gradients() - labels = paddle.randn_like(out) - loss = (out - labels).mean() - 
loss.backward() - model_2 = self.model_class(**init_dict) - model_2.set_state_dict(state_dict=model.state_dict()) - model_2.enable_gradient_checkpointing() - assert model_2.is_gradient_checkpointing and model_2.training - out_2 = model_2(**inputs_dict).sample - model_2.clear_gradients() - loss_2 = (out_2 - labels).mean() - loss_2.backward() - self.assertTrue((loss - loss_2).abs() < 1e-05) - named_params = dict(model.named_parameters()) - named_params_2 = dict(model_2.named_parameters()) - for name, param in named_params.items(): - self.assertTrue(paddle_all_close(param.grad, named_params_2[name].grad, atol=5e-05)) - - def test_model_with_attention_head_dim_tuple(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - init_dict["attention_head_dim"] = 8, 16 - model = self.model_class(**init_dict) - model.eval() - with paddle.no_grad(): - output = model(**inputs_dict) - if isinstance(output, dict): - output = output.sample - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_model_with_use_linear_projection(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - init_dict["use_linear_projection"] = True - model = self.model_class(**init_dict) - model.eval() - with paddle.no_grad(): - output = model(**inputs_dict) - if isinstance(output, dict): - output = output.sample - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_model_with_cross_attention_dim_tuple(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["cross_attention_dim"] = (32, 32) - - model = self.model_class(**init_dict) - model.eval() - - with paddle.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_model_with_simple_projection(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - batch_size, _, _, sample_size = inputs_dict["sample"].shape - - init_dict["class_embed_type"] = "simple_projection" - init_dict["projection_class_embeddings_input_dim"] = sample_size - - inputs_dict["class_labels"] = floats_tensor((batch_size, sample_size)) - - model = self.model_class(**init_dict) - model.eval() - - with paddle.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_model_with_class_embeddings_concat(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - batch_size, _, _, sample_size = inputs_dict["sample"].shape - - init_dict["class_embed_type"] = "simple_projection" - init_dict["projection_class_embeddings_input_dim"] = sample_size - init_dict["class_embeddings_concat"] = True - - inputs_dict["class_labels"] = floats_tensor((batch_size, sample_size)) - - model = self.model_class(**init_dict) - model.eval() - - with paddle.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = 
inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_model_attention_slicing(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - init_dict["attention_head_dim"] = 8, 16 - model = self.model_class(**init_dict) - model.eval() - model.set_attention_slice("auto") - with paddle.no_grad(): - output = model(**inputs_dict) - assert output is not None - model.set_attention_slice("max") - with paddle.no_grad(): - output = model(**inputs_dict) - assert output is not None - model.set_attention_slice(2) - with paddle.no_grad(): - output = model(**inputs_dict) - assert output is not None - - def test_model_sliceable_head_dim(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - init_dict["attention_head_dim"] = 8, 16 - model = self.model_class(**init_dict) - - def check_sliceable_dim_attr(module: paddle.nn.Layer): - if hasattr(module, "set_attention_slice"): - assert isinstance(module.sliceable_head_dim, int) - for child in module.children(): - check_sliceable_dim_attr(child) - - for module in model.children(): - check_sliceable_dim_attr(module) - - def test_special_attn_proc(self): - class AttnEasyProc(nn.Layer): - def __init__(self, num): - super().__init__() - self.weight = self.create_parameter( - (1,), dtype=paddle.get_default_dtype(), default_initializer=nn.initializer.Constant(num) - ) - self.is_run = False - self.number = 0 - self.counter = 0 - - def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, number=None): - batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - query = attn.to_q(hidden_states) - encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - query = attn.head_to_batch_dim(query) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - attention_probs = attn.get_attention_scores(query, key, attention_mask) - hidden_states = paddle.matmul(attention_probs, value) - hidden_states = attn.batch_to_head_dim(hidden_states) - hidden_states = attn.to_out[0](hidden_states) - hidden_states = attn.to_out[1](hidden_states) - hidden_states += self.weight - self.is_run = True - self.counter += 1 - self.number = number - return hidden_states - - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - init_dict["attention_head_dim"] = 8, 16 - model = self.model_class(**init_dict) - processor = AttnEasyProc(5.0) - model.set_attn_processor(processor) - model(**inputs_dict, cross_attention_kwargs={"number": 123}).sample - assert processor.counter == 12 - assert processor.is_run - assert processor.number == 123 - - def test_lora_processors(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - init_dict["attention_head_dim"] = 8, 16 - model = self.model_class(**init_dict) - with paddle.no_grad(): - sample1 = model(**inputs_dict).sample - lora_attn_procs = create_lora_layers(model) - model.set_attn_processor(lora_attn_procs) - model.set_attn_processor(model.attn_processors) - with paddle.no_grad(): - sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample - sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - assert 
(sample1 - sample2).abs().max() < 0.0001 - assert (sample3 - sample4).abs().max() < 0.0001 - assert (sample2 - sample3).abs().max() > 0.0001 - - def test_lora_save_load(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - init_dict["attention_head_dim"] = 8, 16 - paddle.seed(0) - model = self.model_class(**init_dict) - with paddle.no_grad(): - old_sample = model(**inputs_dict).sample - lora_attn_procs = create_lora_layers(model) - model.set_attn_processor(lora_attn_procs) - with paddle.no_grad(): - sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_attn_procs(tmpdirname, to_diffusers=False) - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))) - paddle.seed(0) - new_model = self.model_class(**init_dict) - new_model.load_attn_procs(tmpdirname, from_diffusers=False) - - with paddle.no_grad(): - new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - - assert (sample - new_sample).abs().max() < 1e-4 - - # LoRA and no LoRA should NOT be the same - assert (sample - old_sample).abs().max() > 1e-4 - - def test_lora_save_load_safetensors(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - paddle.seed(0) - model = self.model_class(**init_dict) - - with paddle.no_grad(): - old_sample = model(**inputs_dict).sample - - lora_attn_procs = create_lora_layers(model) - model.set_attn_processor(lora_attn_procs) - - with paddle.no_grad(): - sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_attn_procs(tmpdirname, safe_serialization=True, to_diffusers=True) - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))) - paddle.seed(0) - new_model = self.model_class(**init_dict) - new_model.load_attn_procs(tmpdirname, from_diffusers=True, use_safetensors=True) - with paddle.no_grad(): - new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - assert (sample - new_sample).abs().max() < 0.0001 - assert (sample - old_sample).abs().max() > 0.0001 - - def test_lora_save_safetensors_load_torch(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - paddle.seed(0) - model = self.model_class(**init_dict) - - lora_attn_procs = create_lora_layers(model, mock_weights=False) - model.set_attn_processor(lora_attn_procs) - # Saving as torch, properly reloads with directly filename - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_attn_procs(tmpdirname, to_diffusers=True) - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) - paddle.seed(0) - new_model = self.model_class(**init_dict) - new_model.load_attn_procs( - tmpdirname, weight_name="pytorch_lora_weights.bin", from_diffusers=True, use_safetensors=False - ) - - def test_lora_save_torch_force_load_safetensors_error(self): - pass - - def test_lora_on_off(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - init_dict["attention_head_dim"] = 8, 16 - paddle.seed(0) - model = self.model_class(**init_dict) - with paddle.no_grad(): - old_sample = model(**inputs_dict).sample - lora_attn_procs = 
create_lora_layers(model) - model.set_attn_processor(lora_attn_procs) - with paddle.no_grad(): - sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample - model.set_default_attn_processor() - with paddle.no_grad(): - new_sample = model(**inputs_dict).sample - assert (sample - new_sample).abs().max() < 0.0001 - assert (sample - old_sample).abs().max() < 0.0001 - - @unittest.skipIf( - not is_ppxformers_available(), - reason="scaled_dot_product_attention attention is only available with CUDA and `scaled_dot_product_attention` installed", - ) - def test_lora_xformers_on_off(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - init_dict["attention_head_dim"] = 8, 16 - paddle.seed(0) - model = self.model_class(**init_dict) - lora_attn_procs = create_lora_layers(model) - model.set_attn_processor(lora_attn_procs) - with paddle.no_grad(): - sample = model(**inputs_dict).sample - model.enable_xformers_memory_efficient_attention() - on_sample = model(**inputs_dict).sample - model.disable_xformers_memory_efficient_attention() - off_sample = model(**inputs_dict).sample - assert (sample - on_sample).abs().max() < 0.05 - assert (sample - off_sample).abs().max() < 0.05 - - def test_custom_diffusion_processors(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - model = self.model_class(**init_dict) - - with paddle.no_grad(): - sample1 = model(**inputs_dict).sample - - custom_diffusion_attn_procs = create_custom_ppdiffusion_layers(model, mock_weights=False) - - # make sure we can set a list of attention processors - model.set_attn_processor(custom_diffusion_attn_procs) - - # test that attn processors can be set to itself - model.set_attn_processor(model.attn_processors) - - with paddle.no_grad(): - sample2 = model(**inputs_dict).sample - - assert (sample1 - sample2).abs().max() < 1e-4 - - def test_custom_diffusion_save_load(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - paddle.seed(0) - model = self.model_class(**init_dict) - - with paddle.no_grad(): - old_sample = model(**inputs_dict).sample - - custom_diffusion_attn_procs = create_custom_ppdiffusion_layers(model, mock_weights=False) - model.set_attn_processor(custom_diffusion_attn_procs) - - with paddle.no_grad(): - sample = model(**inputs_dict).sample - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_attn_procs(tmpdirname, to_diffusers=False) - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_custom_diffusion_weights.pdparams"))) - paddle.seed(0) - new_model = self.model_class(**init_dict) - new_model.load_attn_procs( - tmpdirname, weight_name="paddle_custom_diffusion_weights.pdparams", from_diffusers=False - ) - - with paddle.no_grad(): - new_sample = new_model(**inputs_dict).sample - - assert (sample - new_sample).abs().max() < 1e-4 - - # custom diffusion and no custom diffusion should be the same - assert (sample - old_sample).abs().max() < 1e-4 - - @unittest.skipIf( - not is_ppxformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_custom_diffusion_xformers_on_off(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - 
init_dict["attention_head_dim"] = (8, 16) - - paddle.seed(0) - model = self.model_class(**init_dict) - custom_diffusion_attn_procs = create_custom_ppdiffusion_layers(model, mock_weights=False) - model.set_attn_processor(custom_diffusion_attn_procs) - - # default - with paddle.no_grad(): - sample = model(**inputs_dict).sample - - model.enable_xformers_memory_efficient_attention() - on_sample = model(**inputs_dict).sample - - model.disable_xformers_memory_efficient_attention() - off_sample = model(**inputs_dict).sample - - assert (sample - on_sample).abs().max() < 1e-4 - assert (sample - off_sample).abs().max() < 1e-4 - - -@slow -class UNet2DConditionModelIntegrationTests(unittest.TestCase): - def get_file_format(self, seed, shape): - return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" - - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_latents(self, seed=0, shape=(4, 4, 64, 64), fp16=False): - dtype = paddle.float16 if fp16 else paddle.float32 - image = paddle.to_tensor(data=load_ppnlp_numpy(self.get_file_format(seed, shape))).cast(dtype) - return image - - def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"): - revision = "fp16" if fp16 else None - paddle_dtype = paddle.float16 if fp16 else paddle.float32 - model = UNet2DConditionModel.from_pretrained( - model_id, subfolder="unet", paddle_dtype=paddle_dtype, revision=revision - ) - model.eval() - return model - - def test_set_attention_slice_auto(self): - paddle.device.cuda.empty_cache() - unet = self.get_unet_model() - unet.set_attention_slice("auto") - latents = self.get_latents(33) - encoder_hidden_states = self.get_encoder_hidden_states(33) - timestep = 1 - with paddle.no_grad(): - _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - mem_bytes = paddle.device.cuda.memory_allocated() - assert mem_bytes < 5 * 10**9 - - def test_set_attention_slice_max(self): - paddle.device.cuda.empty_cache() - unet = self.get_unet_model() - unet.set_attention_slice("max") - latents = self.get_latents(33) - encoder_hidden_states = self.get_encoder_hidden_states(33) - timestep = 1 - with paddle.no_grad(): - _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - mem_bytes = paddle.device.cuda.memory_allocated() - assert mem_bytes < 5 * 10**9 - - def test_set_attention_slice_int(self): - paddle.device.cuda.empty_cache() - unet = self.get_unet_model() - unet.set_attention_slice(2) - latents = self.get_latents(33) - encoder_hidden_states = self.get_encoder_hidden_states(33) - timestep = 1 - with paddle.no_grad(): - _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - mem_bytes = paddle.device.cuda.memory_allocated() - assert mem_bytes < 5 * 10**9 - - def test_set_attention_slice_list(self): - paddle.device.cuda.empty_cache() - slice_list = 16 * [2, 3] - unet = self.get_unet_model() - unet.set_attention_slice(slice_list) - latents = self.get_latents(33) - encoder_hidden_states = self.get_encoder_hidden_states(33) - timestep = 1 - with paddle.no_grad(): - _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - mem_bytes = paddle.device.cuda.memory_allocated() - assert mem_bytes < 5 * 10**9 - - def get_encoder_hidden_states(self, seed=0, shape=(4, 77, 768), fp16=False): - dtype = "float16" if fp16 else "float32" - hidden_states = paddle.to_tensor(data=load_ppnlp_numpy(self.get_file_format(seed, 
shape))).cast(dtype) - return hidden_states - - @parameterized.expand( - [ - [33, 4, [-0.4424, 0.151, -0.1937, 0.2118, 0.3746, -0.3957, 0.016, -0.0435]], - [47, 0.55, [-0.1508, 0.0379, -0.3075, 0.254, 0.3633, -0.0821, 0.1719, -0.0207]], - [21, 0.89, [-0.6479, 0.6364, -0.3464, 0.8697, 0.4443, -0.6289, -0.0091, 0.1778]], - [9, 1000, [0.8888, -0.5659, 0.5834, -0.7469, 1.1912, -0.3923, 1.1241, -0.4424]], - ] - ) - @require_paddle_gpu - def test_compvis_sd_v1_4(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4") - latents = self.get_latents(seed) - encoder_hidden_states = self.get_encoder_hidden_states(seed) - timestep = paddle.to_tensor([timestep], dtype="int64") - with paddle.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - assert sample.shape == latents.shape - output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() - expected_output_slice = paddle.to_tensor(expected_slice) - assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - - @parameterized.expand( - [ - [83, 4, [-0.2323, -0.1304, 0.0813, -0.3093, -0.0919, -0.1571, -0.1125, -0.5806]], - [17, 0.55, [-0.0831, -0.2443, 0.0901, -0.0919, 0.3396, 0.0103, -0.3743, 0.0701]], - [8, 0.89, [-0.4863, 0.0859, 0.0875, -0.1658, 0.9199, -0.0114, 0.4839, 0.4639]], - [3, 1000, [-0.5649, 0.2402, -0.5518, 0.1248, 1.1328, -0.2443, -0.0325, -1.0078]], - ] - ) - @require_paddle_gpu - def test_compvis_sd_v1_4_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4", fp16=True) - latents = self.get_latents(seed, fp16=True) - encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True) - timestep = paddle.to_tensor([timestep], dtype="int64") - with paddle.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - assert sample.shape == latents.shape - output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() - expected_output_slice = paddle.to_tensor(expected_slice) - assert paddle_all_close(output_slice, expected_output_slice, atol=0.005) - - @parameterized.expand( - [ - [33, 4, [-0.443, 0.157, -0.1867, 0.2376, 0.3205, -0.3681, 0.0525, -0.0722]], - [47, 0.55, [-0.1415, 0.0129, -0.3136, 0.2257, 0.343, -0.0536, 0.2114, -0.0436]], - [21, 0.89, [-0.7091, 0.6664, -0.3643, 0.9032, 0.4499, -0.6541, 0.0139, 0.175]], - [9, 1000, [0.8878, -0.5659, 0.5844, -0.7442, 1.1883, -0.3927, 1.1192, -0.4423]], - ] - ) - @require_paddle_gpu - def test_compvis_sd_v1_5(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="runwayml/stable-diffusion-v1-5") - latents = self.get_latents(seed) - encoder_hidden_states = self.get_encoder_hidden_states(seed) - timestep = paddle.to_tensor([timestep], dtype="int64") - with paddle.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - assert sample.shape == latents.shape - output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() - expected_output_slice = paddle.to_tensor(expected_slice) - assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - - @parameterized.expand( - [ - [83, 4, [-0.2695, -0.1669, 0.0073, -0.3181, -0.1187, -0.1676, -0.1395, -0.5972]], - [17, 0.55, [-0.129, -0.2588, 0.0551, -0.0916, 0.3286, 0.0238, -0.3669, 0.0322]], - [8, 0.89, [-0.5283, 0.1198, 0.087, -0.1141, 0.9189, -0.015, 0.5474, 0.4319]], - [3, 1000, [-0.5601, 0.2411, 
-0.5435, 0.1268, 1.1338, -0.2427, -0.028, -1.002]], - ] - ) - @require_paddle_gpu - def test_compvis_sd_v1_5_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="runwayml/stable-diffusion-v1-5", fp16=True) - latents = self.get_latents(seed, fp16=True) - encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True) - timestep = paddle.to_tensor([timestep], dtype="int64") - with paddle.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - assert sample.shape == latents.shape - output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() - expected_output_slice = paddle.to_tensor(expected_slice) - assert paddle_all_close(output_slice, expected_output_slice, atol=0.005) - - @parameterized.expand( - [ - [33, 4, [-0.7639, 0.0106, -0.1615, -0.3487, -0.0423, -0.7972, 0.0085, -0.4858]], - [47, 0.55, [-0.6564, 0.0795, -1.9026, -0.6258, 1.8235, 1.2056, 1.2169, 0.9073]], - [21, 0.89, [0.0327, 0.4399, -0.6358, 0.3417, 0.412, -0.5621, -0.0397, -1.043]], - [9, 1000, [0.16, 0.7303, -1.0556, -0.3515, -0.744, -1.2037, -1.8149, -1.8931]], - ] - ) - @require_paddle_gpu - def test_compvis_sd_inpaint(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="runwayml/stable-diffusion-inpainting") - latents = self.get_latents(seed, shape=(4, 9, 64, 64)) - encoder_hidden_states = self.get_encoder_hidden_states(seed) - timestep = paddle.to_tensor([timestep], dtype="int64") - with paddle.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - assert sample.shape == [4, 4, 64, 64] - output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() - expected_output_slice = paddle.to_tensor(expected_slice) - assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - - @parameterized.expand( - [ - [83, 4, [-0.1047, -1.7227, 0.1067, 0.0164, -0.5698, -0.4172, -0.1388, 1.1387]], - [17, 0.55, [0.0975, -0.2856, -0.3508, -0.46, 0.3376, 0.293, -0.2747, -0.7026]], - [8, 0.89, [-0.0952, 0.0183, -0.5825, -0.1981, 0.1131, 0.4668, -0.0395, -0.3486]], - [3, 1000, [0.479, 0.4949, -1.0732, -0.7158, 0.7959, -0.9478, 0.1105, -0.9741]], - ] - ) - @require_paddle_gpu - def test_compvis_sd_inpaint_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="runwayml/stable-diffusion-inpainting", fp16=True) - latents = self.get_latents(seed, shape=(4, 9, 64, 64), fp16=True) - encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True) - timestep = paddle.to_tensor([timestep], dtype="int64") - with paddle.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - assert sample.shape == [4, 4, 64, 64] - output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() - expected_output_slice = paddle.to_tensor(expected_slice) - assert paddle_all_close(output_slice, expected_output_slice, atol=0.005) - - @parameterized.expand( - [ - [83, 4, [0.1514, 0.0807, 0.1624, 0.1016, -0.1896, 0.0263, 0.0677, 0.231]], - [17, 0.55, [0.1164, -0.0216, 0.017, 0.1589, -0.312, 0.1005, -0.0581, -0.1458]], - [8, 0.89, [-0.1758, -0.0169, 0.1004, -0.1411, 0.1312, 0.1103, -0.1996, 0.2139]], - [3, 1000, [0.1214, 0.0352, -0.0731, -0.1562, -0.0994, -0.0906, -0.234, -0.0539]], - ] - ) - @require_paddle_gpu - def test_stabilityai_sd_v2_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="stabilityai/stable-diffusion-2", fp16=True) - latents = 
self.get_latents(seed, shape=(4, 4, 96, 96), fp16=True) - encoder_hidden_states = self.get_encoder_hidden_states(seed, shape=(4, 77, 1024), fp16=True) - timestep = paddle.to_tensor([timestep], dtype="int64") - with paddle.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - assert sample.shape == latents.shape - output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() - expected_output_slice = paddle.to_tensor(expected_slice) - assert paddle_all_close(output_slice, expected_output_slice, atol=0.005) diff --git a/ppdiffusers/tests/models/test_models_unet_3d_condition.py b/ppdiffusers/tests/models/test_models_unet_3d_condition.py deleted file mode 100644 index d6da9c7fed65..000000000000 --- a/ppdiffusers/tests/models/test_models_unet_3d_condition.py +++ /dev/null @@ -1,345 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import tempfile -import unittest - -import numpy as np -import paddle - -from ppdiffusers.models import UNet3DConditionModel -from ppdiffusers.models.attention_processor import AttnProcessor, LoRAAttnProcessor -from ppdiffusers.utils import floats_tensor, logging -from ppdiffusers.utils.import_utils import is_ppxformers_available - -from .test_modeling_common import ModelTesterMixin - -logger = logging.get_logger(__name__) - - -def create_lora_layers(model, mock_weights: bool = True): - lora_attn_procs = {} - for name in model.attn_processors.keys(): - has_cross_attention = name.endswith("attn2.processor") and not ( - name.startswith("transformer_in") or "temp_attentions" in name.split(".") - ) - cross_attention_dim = model.config.cross_attention_dim if has_cross_attention else None - if name.startswith("mid_block"): - hidden_size = model.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = model.config.block_out_channels[block_id] - elif name.startswith("transformer_in"): - # Note that the `8 * ...` comes from: https://github.com/huggingface/diffusers/blob/7139f0e874f10b2463caa8cbd585762a309d12d6/src/diffusers/models/unet_3d_condition.py#L148 - hidden_size = 8 * model.config.attention_head_dim - - lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) - - if mock_weights: - # add 1 to weights to mock trained weights - with paddle.no_grad(): - lora_attn_procs[name].to_q_lora.up.weight.set_value(lora_attn_procs[name].to_q_lora.up.weight + 1) - lora_attn_procs[name].to_k_lora.up.weight.set_value(lora_attn_procs[name].to_k_lora.up.weight + 1) - lora_attn_procs[name].to_v_lora.up.weight.set_value(lora_attn_procs[name].to_v_lora.up.weight + 1) - 
lora_attn_procs[name].to_out_lora.up.weight.set_value(lora_attn_procs[name].to_out_lora.up.weight + 1) - return lora_attn_procs - - -class UNet3DConditionModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet3DConditionModel - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 4 - num_frames = 4 - sizes = (32, 32) - noise = floats_tensor((batch_size, num_channels, num_frames) + sizes) - time_step = paddle.to_tensor([10]) - encoder_hidden_states = floats_tensor((batch_size, 4, 32)) - return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states} - - @property - def input_shape(self): - return (4, 4, 32, 32) - - @property - def output_shape(self): - return (4, 4, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": (32, 64), - "down_block_types": ( - "CrossAttnDownBlock3D", - "DownBlock3D", - ), - "up_block_types": ("UpBlock3D", "CrossAttnUpBlock3D"), - "cross_attention_dim": 32, - "attention_head_dim": 8, - "out_channels": 4, - "in_channels": 4, - "layers_per_block": 1, - "sample_size": 32, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @unittest.skipIf( - not is_ppxformers_available(), reason="XFormers attention is only available with CUDA and `xformers` installed" - ) - def test_xformers_enable_works(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - - model.enable_xformers_memory_efficient_attention() - - assert ( - model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__ - == "XFormersAttnProcessor" - ), "xformers is not enabled" - - # Overriding to set `norm_num_groups` needs to be different for this model. - def test_forward_with_norm_groups(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["norm_num_groups"] = 32 - - model = self.model_class(**init_dict) - - model.eval() - with paddle.no_grad(): - output = model(**inputs_dict) - if isinstance(output, dict): - output = output.sample - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - # Overriding since the UNet3D outputs a different structure. 
- def test_determinism(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.eval() - with paddle.no_grad(): - first = model(**inputs_dict) - if isinstance(first, dict): - first = first.sample - second = model(**inputs_dict) - if isinstance(second, dict): - second = second.sample - out_1 = first.cpu().numpy() - out_2 = second.cpu().numpy() - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-05) - - def test_model_attention_slicing(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - init_dict["attention_head_dim"] = 8 - model = self.model_class(**init_dict) - model.eval() - model.set_attention_slice("auto") - with paddle.no_grad(): - output = model(**inputs_dict) - assert output is not None - model.set_attention_slice("max") - with paddle.no_grad(): - output = model(**inputs_dict) - assert output is not None - model.set_attention_slice(2) - with paddle.no_grad(): - output = model(**inputs_dict) - assert output is not None - - def test_lora_processors(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = 8 - - model = self.model_class(**init_dict) - - with paddle.no_grad(): - sample1 = model(**inputs_dict).sample - - lora_attn_procs = create_lora_layers(model) - - # make sure we can set a list of attention processors - model.set_attn_processor(lora_attn_procs) - - # test that attn processors can be set to itself - model.set_attn_processor(model.attn_processors) - - with paddle.no_grad(): - sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample - sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - - assert (sample1 - sample2).abs().max() < 1e-4 - assert (sample3 - sample4).abs().max() < 1e-4 - - # sample 2 and sample 3 should be different - assert (sample2 - sample3).abs().max() > 1e-4 - - def test_lora_save_load(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = 8 - - paddle.seed(0) - model = self.model_class(**init_dict) - - with paddle.no_grad(): - old_sample = model(**inputs_dict).sample - - lora_attn_procs = create_lora_layers(model) - model.set_attn_processor(lora_attn_procs) - - with paddle.no_grad(): - sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_attn_procs( - tmpdirname, - to_diffusers=False, - ) - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))) - paddle.seed(0) - new_model = self.model_class(**init_dict) - new_model.load_attn_procs(tmpdirname, from_diffusers=False) - - with paddle.no_grad(): - new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - - assert (sample - new_sample).abs().max() < 1e-4 - - # LoRA and no LoRA should NOT be the same - assert (sample - old_sample).abs().max() > 1e-4 - - def test_lora_save_load_safetensors(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = 8 - - paddle.seed(0) - model = self.model_class(**init_dict) - - with paddle.no_grad(): - old_sample = model(**inputs_dict).sample - - lora_attn_procs = create_lora_layers(model) - 
model.set_attn_processor(lora_attn_procs) - - with paddle.no_grad(): - sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_attn_procs(tmpdirname, safe_serialization=True, to_diffusers=True) - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))) - paddle.seed(0) - new_model = self.model_class(**init_dict) - new_model.load_attn_procs(tmpdirname, use_safetensors=True, from_diffusers=True) - - with paddle.no_grad(): - new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - - assert (sample - new_sample).abs().max() < 1e-4 - - # LoRA and no LoRA should NOT be the same - assert (sample - old_sample).abs().max() > 1e-4 - - def test_lora_save_safetensors_load_torch(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = 8 - - paddle.seed(0) - model = self.model_class(**init_dict) - - lora_attn_procs = create_lora_layers(model, mock_weights=False) - model.set_attn_processor(lora_attn_procs) - # Saving as paddle, properly reloads with directly filename - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_attn_procs(tmpdirname, to_diffusers=True) - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) - paddle.seed(0) - new_model = self.model_class(**init_dict) - new_model.load_attn_procs( - tmpdirname, weight_name="pytorch_lora_weights.bin", use_safetensors=False, from_diffusers=True - ) - - def test_lora_save_paddle_force_load_safetensors_error(self): - pass - - def test_lora_on_off(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = 8 - - paddle.seed(0) - model = self.model_class(**init_dict) - - with paddle.no_grad(): - old_sample = model(**inputs_dict).sample - - lora_attn_procs = create_lora_layers(model) - model.set_attn_processor(lora_attn_procs) - - with paddle.no_grad(): - sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample - - model.set_attn_processor(AttnProcessor()) - - with paddle.no_grad(): - new_sample = model(**inputs_dict).sample - - assert (sample - new_sample).abs().max() < 1e-4 - assert (sample - old_sample).abs().max() < 1e-4 - - @unittest.skipIf( - not is_ppxformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_lora_xformers_on_off(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = 4 - - paddle.seed(0) - model = self.model_class(**init_dict) - lora_attn_procs = create_lora_layers(model) - model.set_attn_processor(lora_attn_procs) - - # default - with paddle.no_grad(): - sample = model(**inputs_dict).sample - - model.enable_xformers_memory_efficient_attention() - on_sample = model(**inputs_dict).sample - - model.disable_xformers_memory_efficient_attention() - off_sample = model(**inputs_dict).sample - - assert (sample - on_sample).abs().max() < 0.005 - assert (sample - off_sample).abs().max() < 0.005 - - -# (todo: sayakpaul) implement SLOW tests. 
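Note: the removed test_models_unet_3d_condition.py above exercises the same LoRA attention-processor round trip as the 2D-UNet tests earlier in this patch: build one LoRAAttnProcessor per attention module, install the mapping with set_attn_processor, then persist and reload it with save_attn_procs / load_attn_procs. The following is a minimal reference sketch of that workflow outside the test harness; it assumes the same toy model configuration the deleted tests use and an illustrative output directory, and it is not part of the patch itself.

# Reference sketch only (hedged); mirrors the deleted create_lora_layers() helper.
from ppdiffusers.models import UNet3DConditionModel
from ppdiffusers.models.attention_processor import LoRAAttnProcessor

# Toy configuration taken from prepare_init_args_and_inputs_for_common() above.
model = UNet3DConditionModel(
    block_out_channels=(32, 64),
    down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
    up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
    cross_attention_dim=32,
    attention_head_dim=8,
    in_channels=4,
    out_channels=4,
    layers_per_block=1,
    sample_size=32,
)

# One LoRAAttnProcessor per attention module, using the same hidden-size lookup
# as the deleted helper: mid/up/down blocks read block_out_channels, and the
# transformer_in block uses 8 * attention_head_dim.
lora_attn_procs = {}
for name in model.attn_processors.keys():
    has_cross_attention = name.endswith("attn2.processor") and not (
        name.startswith("transformer_in") or "temp_attentions" in name.split(".")
    )
    cross_attention_dim = model.config.cross_attention_dim if has_cross_attention else None
    if name.startswith("mid_block"):
        hidden_size = model.config.block_out_channels[-1]
    elif name.startswith("up_blocks"):
        block_id = int(name[len("up_blocks.")])
        hidden_size = list(reversed(model.config.block_out_channels))[block_id]
    elif name.startswith("down_blocks"):
        block_id = int(name[len("down_blocks.")])
        hidden_size = model.config.block_out_channels[block_id]
    else:  # transformer_in
        hidden_size = 8 * model.config.attention_head_dim
    lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)

model.set_attn_processor(lora_attn_procs)
# Illustrative directory name; in Paddle format this writes paddle_lora_weights.pdparams.
model.save_attn_procs("./unet3d_lora", to_diffusers=False)
model.load_attn_procs("./unet3d_lora", from_diffusers=False)
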
diff --git a/ppdiffusers/tests/models/test_models_vae.py b/ppdiffusers/tests/models/test_models_vae.py deleted file mode 100644 index 0d7993c0a116..000000000000 --- a/ppdiffusers/tests/models/test_models_vae.py +++ /dev/null @@ -1,322 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import paddle -from parameterized import parameterized - -from ppdiffusers import AutoencoderKL -from ppdiffusers.utils import ( - floats_tensor, - load_ppnlp_numpy, - paddle_all_close, - require_paddle_gpu, - slow, -) - -from .test_modeling_common import ModelTesterMixin - - -class AutoencoderKLTests(ModelTesterMixin, unittest.TestCase): - model_class = AutoencoderKL - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 3 - sizes = 32, 32 - image = floats_tensor((batch_size, num_channels) + sizes) - return {"sample": image} - - @property - def input_shape(self): - return 3, 32, 32 - - @property - def output_shape(self): - return 3, 32, 32 - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": [32, 64], - "in_channels": 3, - "out_channels": 3, - "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"], - "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"], - "latent_channels": 4, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_forward_signature(self): - pass - - def test_training(self): - pass - - def test_gradient_checkpointing(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - - assert not model.is_gradient_checkpointing and model.training - - out = model(**inputs_dict).sample - # run the backwards pass on the model. For backwards pass, for simplicity purpose, - # we won't calculate the loss and rather backprop on out.sum() - model.clear_gradients() - - labels = paddle.randn(out.shape, dtype=out.dtype) - loss = (out - labels).mean() - loss.backward() - - # re-instantiate the model now enabling gradient checkpointing - model_2 = self.model_class(**init_dict) - # clone model - model_2.load_dict(model.state_dict()) - model_2.enable_gradient_checkpointing() - - assert model_2.is_gradient_checkpointing and model_2.training - - out_2 = model_2(**inputs_dict).sample - # run the backwards pass on the model. 
For backwards pass, for simplicity purpose, - # we won't calculate the loss and rather backprop on out.sum() - model_2.clear_gradients() - loss_2 = (out_2 - labels).mean() - loss_2.backward() - - # compare the output and parameters gradients - self.assertTrue((loss - loss_2).abs() < 1e-5) - named_params = dict(model.named_parameters()) - named_params_2 = dict(model_2.named_parameters()) - with paddle.no_grad(): - for name, param in named_params.items(): - self.assertTrue(paddle_all_close(param.grad, named_params_2[name].grad, atol=5e-5)) - - def test_from_pretrained_hub(self): - model, loading_info = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy", output_loading_info=True) - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - image = model(**self.dummy_input) - assert image is not None, "Make sure output is not None" - - def test_output_pretrained(self): - model = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy") - model.eval() - - generator = paddle.Generator().manual_seed(0) - image = paddle.randn( - shape=[1, model.config.in_channels, model.config.sample_size, model.config.sample_size], - generator=paddle.Generator().manual_seed(0), - ) - with paddle.no_grad(): - output = model(image, sample_posterior=True, generator=generator).sample - output_slice = output[0, -1, -3:, -3:].flatten().cpu() - expected_output_slice = paddle.to_tensor( - [ - -0.39049336, - 0.34836933, - 0.27105471, - -0.02148458, - 0.00975929, - 0.27822807, - -0.12224892, - -0.02011922, - 0.19761699, - ] - ) - self.assertTrue(paddle_all_close(output_slice, expected_output_slice, rtol=0.01)) - - -@slow -class AutoencoderKLIntegrationTests(unittest.TestCase): - def get_file_format(self, seed, shape): - return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" - - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False): - dtype = paddle.float16 if fp16 else paddle.float32 - image = paddle.to_tensor(data=load_ppnlp_numpy(self.get_file_format(seed, shape))).cast(dtype) - return image - - def get_sd_vae_model(self, model_id="CompVis/stable-diffusion-v1-4", fp16=False): - revision = "fp16" if fp16 else None - paddle_dtype = paddle.float16 if fp16 else paddle.float32 - model = AutoencoderKL.from_pretrained(model_id, subfolder="vae", paddle_dtype=paddle_dtype, revision=revision) - model.eval() - return model - - def get_generator(self, seed=0): - return paddle.Generator().manual_seed(seed) - - @parameterized.expand( - [ - [ - 33, - [-0.1603, 0.9878, -0.0495, -0.079, -0.2709, 0.8375, -0.206, -0.0824], - [-0.2395, 0.0098, 0.0102, -0.0709, -0.284, -0.0274, -0.0718, -0.1824], - ], - [ - 47, - [-0.2376, 0.1168, 0.1332, -0.484, -0.2508, -0.0791, -0.0493, -0.4089], - [0.035, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131], - ], - ] - ) - def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - generator = self.get_generator(seed) - with paddle.no_grad(): - sample = model(image, generator=generator, sample_posterior=True).sample - assert sample.shape == image.shape - output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() - expected_output_slice = paddle.to_tensor(expected_slice) - assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - - @parameterized.expand( - [ - [33, [-0.0513, 0.0289, 1.3799, 0.2166, -0.2573, -0.0871, 
0.5103, -0.0999]], - [47, [-0.4128, -0.132, -0.3704, 0.1965, -0.4116, -0.2332, -0.334, 0.2247]], - ] - ) - @require_paddle_gpu - def test_stable_diffusion_fp16(self, seed, expected_slice): - model = self.get_sd_vae_model(fp16=True) - image = self.get_sd_image(seed, fp16=True) - generator = self.get_generator(seed) - with paddle.no_grad(): - sample = model(image, generator=generator, sample_posterior=True).sample - assert sample.shape == image.shape - output_slice = sample[-1, -2:, :2, -2:].flatten().cast("float32").cpu() - expected_output_slice = paddle.to_tensor(expected_slice) - assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - - @parameterized.expand( - [ - [ - 33, - [-0.1609, 0.9866, -0.0487, -0.0777, -0.2716, 0.8368, -0.2055, -0.0814], - [-0.2395, 0.0098, 0.0102, -0.0709, -0.284, -0.0274, -0.0718, -0.1824], - ], - [ - 47, - [-0.2377, 0.1147, 0.1333, -0.4841, -0.2506, -0.0805, -0.0491, -0.4085], - [0.035, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131], - ], - ] - ) - def test_stable_diffusion_mode(self, seed, expected_slice, expected_slice_mps): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - with paddle.no_grad(): - sample = model(image).sample - assert sample.shape == image.shape - output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() - expected_output_slice = paddle.to_tensor(expected_slice) - assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - - @parameterized.expand( - [ - [13, [-0.2051, -0.1803, -0.2311, -0.2114, -0.3292, -0.3574, -0.2953, -0.3323]], - [37, [-0.2632, -0.2625, -0.2199, -0.2741, -0.4539, -0.499, -0.372, -0.4925]], - ] - ) - @require_paddle_gpu - def test_stable_diffusion_decode(self, seed, expected_slice): - model = self.get_sd_vae_model() - encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) - with paddle.no_grad(): - sample = model.decode(encoding).sample - assert list(sample.shape) == [3, 3, 512, 512] - output_slice = sample[-1, -2:, :2, -2:].flatten().cpu() - expected_output_slice = paddle.to_tensor(expected_slice) - assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - - @parameterized.expand( - [ - [27, [-0.0369, 0.0207, -0.0776, -0.0682, -0.1747, -0.193, -0.1465, -0.2039]], - [16, [-0.1628, -0.2134, -0.2747, -0.2642, -0.3774, -0.4404, -0.3687, -0.4277]], - ] - ) - @require_paddle_gpu - def test_stable_diffusion_decode_fp16(self, seed, expected_slice): - model = self.get_sd_vae_model(fp16=True) - encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64), fp16=True) - with paddle.no_grad(): - sample = model.decode(encoding).sample - assert list(sample.shape) == [3, 3, 512, 512] - output_slice = sample[-1, -2:, :2, -2:].flatten().cast("float32").cpu() - expected_output_slice = paddle.to_tensor(expected_slice) - assert paddle_all_close(output_slice, expected_output_slice, atol=0.005) - - @parameterized.expand([(13,), (16,), (27,)]) - @require_paddle_gpu - def test_stable_diffusion_decode_ppxformers_vs_2_5_fp16(self, seed): - model = self.get_sd_vae_model(fp16=True) - encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64), fp16=True) - - with paddle.no_grad(): - sample = model.decode(encoding).sample - - model.enable_xformers_memory_efficient_attention() - with paddle.no_grad(): - sample_2 = model.decode(encoding).sample - - assert list(sample.shape) == [3, 3, 512, 512] - - assert paddle_all_close(sample, sample_2, atol=1e-1) - - @parameterized.expand([(13,), (16,), (37,)]) - @require_paddle_gpu - def 
test_stable_diffusion_decode_ppxformers_vs_2_5(self, seed): - model = self.get_sd_vae_model() - encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) - - with paddle.no_grad(): - sample = model.decode(encoding).sample - - model.enable_xformers_memory_efficient_attention() - with paddle.no_grad(): - sample_2 = model.decode(encoding).sample - - assert list(sample.shape) == [3, 3, 512, 512] - - assert paddle_all_close(sample, sample_2, atol=1e-2) - - @parameterized.expand( - [ - [33, [-0.3001, 0.0918, -2.6984, -3.972, -3.2099, -5.0353, 1.7338, -0.2065, 3.4267]], - [47, [-1.503, -4.3871, -6.0355, -9.1157, -1.6661, -2.7853, 2.1607, -5.0823, 2.5633]], - ] - ) - def test_stable_diffusion_encode_sample(self, seed, expected_slice): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - generator = self.get_generator(seed) - with paddle.no_grad(): - dist = model.encode(image).latent_dist - sample = dist.sample(generator=generator) - assert list(sample.shape) == [image.shape[0], 4] + [(i // 8) for i in image.shape[2:]] - output_slice = sample[0, -1, -3:, -3:].flatten().cpu() - expected_output_slice = paddle.to_tensor(expected_slice) - tolerance = 0.01 - assert paddle_all_close(output_slice, expected_output_slice, atol=tolerance) diff --git a/ppdiffusers/tests/models/test_models_vq.py b/ppdiffusers/tests/models/test_models_vq.py deleted file mode 100644 index accb29913df8..000000000000 --- a/ppdiffusers/tests/models/test_models_vq.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import paddle - -from ppdiffusers import VQModel -from ppdiffusers.utils import floats_tensor - -from .test_modeling_common import ModelTesterMixin - - -class VQModelTests(ModelTesterMixin, unittest.TestCase): - model_class = VQModel - - @property - def dummy_input(self, sizes=(32, 32)): - batch_size = 4 - num_channels = 3 - image = floats_tensor((batch_size, num_channels) + sizes) - return {"sample": image} - - @property - def input_shape(self): - return 3, 32, 32 - - @property - def output_shape(self): - return 3, 32, 32 - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": [32, 64], - "in_channels": 3, - "out_channels": 3, - "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"], - "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"], - "latent_channels": 3, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_forward_signature(self): - pass - - def test_training(self): - pass - - def test_from_pretrained_hub(self): - model, loading_info = VQModel.from_pretrained("fusing/vqgan-dummy", output_loading_info=True) - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - image = model(**self.dummy_input) - assert image is not None, "Make sure output is not None" - - def test_output_pretrained(self): - model = VQModel.from_pretrained("fusing/vqgan-dummy") - model.eval() - paddle.seed(0) - image = paddle.randn(shape=[1, model.config.in_channels, model.config.sample_size, model.config.sample_size]) - with paddle.no_grad(): - output = model(image).sample - output_slice = output[0, -1, -3:, -3:].flatten().cpu() - expected_output_slice = paddle.to_tensor( - [ - -0.027147896587848663, - -0.41129639744758606, - -0.17730756103992462, - -0.5245445370674133, - -0.2423611730337143, - -0.3957087993621826, - -0.16461530327796936, - -0.06902074813842773, - -0.01736617460846901, - ] - ) - self.assertTrue(paddle.allclose(output_slice, expected_output_slice, atol=0.01)) diff --git a/ppdiffusers/tests/models/test_unet_2d_blocks.py b/ppdiffusers/tests/models/test_unet_2d_blocks.py deleted file mode 100644 index cfb2100ee38b..000000000000 --- a/ppdiffusers/tests/models/test_unet_2d_blocks.py +++ /dev/null @@ -1,556 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# noqa F403 -import unittest - -from ppdiffusers.models.unet_2d_blocks import ( - AttnDownBlock2D, - AttnDownEncoderBlock2D, - AttnSkipDownBlock2D, - AttnSkipUpBlock2D, - AttnUpBlock2D, - AttnUpDecoderBlock2D, - CrossAttnDownBlock2D, - CrossAttnUpBlock2D, - DownBlock2D, - DownEncoderBlock2D, - ResnetDownsampleBlock2D, - ResnetUpsampleBlock2D, - SimpleCrossAttnDownBlock2D, - SimpleCrossAttnUpBlock2D, - SkipDownBlock2D, - SkipUpBlock2D, - UNetMidBlock2D, - UNetMidBlock2DCrossAttn, - UNetMidBlock2DSimpleCrossAttn, - UpBlock2D, - UpDecoderBlock2D, -) - -from .test_unet_blocks_common import UNetBlockTesterMixin - - -class DownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = DownBlock2D - block_type = "down" - - def test_output(self): - expected_slice = [ - 1.4686200618743896, - -1.0339399576187134, - -0.6087006330490112, - -0.9044048190116882, - 0.21288111805915833, - -0.8680574297904968, - -0.4164941906929016, - -1.6082428693771362, - -1.5554661750793457, - ] - super().test_output(expected_slice) - - -class ResnetDownsampleBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = ResnetDownsampleBlock2D - block_type = "down" - - def test_output(self): - expected_slice = [ - 0.1373986005783081, - -0.06267327070236206, - 0.6338546276092529, - 0.9961339235305786, - 0.012131750583648682, - 0.2271430492401123, - 0.4698519706726074, - -1.2050957679748535, - -0.12423264980316162, - ] - super().test_output(expected_slice) - - -class AttnDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = AttnDownBlock2D - block_type = "down" - - def test_output(self): - expected_slice = [ - -3.9491326808929443, - -0.5726033449172974, - -0.1606975793838501, - 0.16732816398143768, - 0.480291485786438, - -0.6275963187217712, - 0.8580896258354187, - -2.3375632762908936, - -1.4645881652832031, - ] - super().test_output(expected_slice) - - -class CrossAttnDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = CrossAttnDownBlock2D - block_type = "down" - - def prepare_init_args_and_inputs_for_common(self): - init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common() - init_dict["cross_attention_dim"] = 32 - return init_dict, inputs_dict - - def test_output(self): - expected_slice = [ - 2.6956636905670166, - -4.308715343475342, - 1.5738945007324219, - 0.9817700982093811, - -2.193608283996582, - -0.42364418506622314, - 6.60827112197876, - 0.9649910926818848, - 2.8010499477386475, - ] - super().test_output(expected_slice) - - -class SimpleCrossAttnDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = SimpleCrossAttnDownBlock2D - block_type = "down" - - @property - def dummy_input(self): - return super().get_dummy_input(include_encoder_hidden_states=True) - - def prepare_init_args_and_inputs_for_common(self): - init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common() - init_dict["cross_attention_dim"] = 32 - return init_dict, inputs_dict - - def test_output(self): - expected_slice = [ - -1.6289970874786377, - 1.3748600482940674, - -0.10375875234603882, - 0.9955897331237793, - -0.8343256115913391, - 0.382874071598053, - -0.10101768374443054, - -0.250579297542572, - -0.9541524648666382, - ] - super().test_output(expected_slice) - - -class SkipDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = SkipDownBlock2D - block_type = "down" - - @property - def dummy_input(self): - return super().get_dummy_input(include_skip_sample=True) - - def test_output(self): - expected_slice = [ - 
0.2892754375934601, - -0.4464714229106903, - -0.18036654591560364, - -0.4965817928314209, - -0.050021037459373474, - -0.6248312592506409, - -0.5183243751525879, - -0.02524399757385254, - 0.1424381136894226, - ] - super().test_output(expected_slice) - - -class AttnSkipDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = AttnSkipDownBlock2D - block_type = "down" - - @property - def dummy_input(self): - return super().get_dummy_input(include_skip_sample=True) - - def test_output(self): - expected_slice = [ - -0.4862610697746277, - 0.8827285766601562, - 0.7600707411766052, - 1.828415870666504, - 0.7132594585418701, - -0.12354043126106262, - 0.7799923419952393, - -0.2145882546901703, - -1.3009073734283447, - ] - super().test_output(expected_slice) - - -class DownEncoderBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = DownEncoderBlock2D - block_type = "down" - - @property - def dummy_input(self): - return super().get_dummy_input(include_temb=False) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = {"in_channels": 32, "out_channels": 32} - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_output(self): - expected_slice = [ - 2.2016096115112305, - -0.15662731230258942, - 1.789330005645752, - 0.392975389957428, - -4.444106578826904, - 2.293689489364624, - -0.7877296805381775, - 0.5266609191894531, - -0.15173353254795074, - ] - super().test_output(expected_slice) - - -class AttnDownEncoderBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = AttnDownEncoderBlock2D - block_type = "down" - - @property - def dummy_input(self): - return super().get_dummy_input(include_temb=False) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = {"in_channels": 32, "out_channels": 32} - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_output(self): - expected_slice = [ - 2.127671957015991, - -0.11142143607139587, - 1.2964460849761963, - 3.6022450923919678, - -1.7154743671417236, - 1.6823889017105103, - -1.6448723077774048, - -0.4970707595348358, - -3.637833833694458, - ] - super().test_output(expected_slice) - - -class UNetMidBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = UNetMidBlock2D - block_type = "mid" - - def prepare_init_args_and_inputs_for_common(self): - init_dict = {"in_channels": 32, "temb_channels": 128} - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_output(self): - expected_slice = [ - -2.115619421005249, - -0.18567246198654175, - -1.673149585723877, - -0.8526121973991394, - -0.09890538454055786, - -2.894134998321533, - -0.2579667568206787, - 0.02939319610595703, - 1.1619269847869873, - ] - super().test_output(expected_slice) - - -class UNetMidBlock2DCrossAttnTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = UNetMidBlock2DCrossAttn - block_type = "mid" - - def prepare_init_args_and_inputs_for_common(self): - init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common() - init_dict["cross_attention_dim"] = 32 - return init_dict, inputs_dict - - def test_output(self): - expected_slice = [ - -2.235785961151123, - -2.2744078636169434, - 0.22076213359832764, - -3.0804693698883057, - -1.8690654039382935, - -4.610274791717529, - -0.625274121761322, - 0.4143417179584503, - -1.8598196506500244, - ] - super().test_output(expected_slice) - - -class UNetMidBlock2DSimpleCrossAttnTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = UNetMidBlock2DSimpleCrossAttn - block_type = 
"mid" - - @property - def dummy_input(self): - return super().get_dummy_input(include_encoder_hidden_states=True) - - def prepare_init_args_and_inputs_for_common(self): - init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common() - init_dict["cross_attention_dim"] = 32 - return init_dict, inputs_dict - - def test_output(self): - expected_slice = [ - -3.61512899, - 0.17301944, - -0.69105405, - -1.40025711, - -1.59702873, - -1.47273242, - -0.79226393, - -1.22910488, - 1.09667253, - ] - super().test_output(expected_slice) - - -class UpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = UpBlock2D - block_type = "up" - - @property - def dummy_input(self): - return super().get_dummy_input(include_res_hidden_states_tuple=True) - - def test_output(self): - expected_slice = [ - -4.957080364227295, - 0.49701011180877686, - 4.326162815093994, - -2.624238967895508, - 1.4365060329437256, - 3.467172145843506, - 0.8403439521789551, - 1.941118597984314, - -0.4804985523223877, - ] - super().test_output(expected_slice) - - -class ResnetUpsampleBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = ResnetUpsampleBlock2D - block_type = "up" - - @property - def dummy_input(self): - return super().get_dummy_input(include_res_hidden_states_tuple=True) - - def test_output(self): - expected_slice = [ - -2.075526714324951, - -3.90122652053833, - -3.0005340576171875, - -0.9611822366714478, - -1.0546646118164062, - -1.7606399059295654, - -0.24509593844413757, - -0.025167375802993774, - -0.7591105699539185, - ] - super().test_output(expected_slice) - - -class CrossAttnUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = CrossAttnUpBlock2D - block_type = "up" - - @property - def dummy_input(self): - return super().get_dummy_input(include_res_hidden_states_tuple=True) - - def prepare_init_args_and_inputs_for_common(self): - init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common() - init_dict["cross_attention_dim"] = 32 - return init_dict, inputs_dict - - def test_output(self): - expected_slice = [ - -1.2535507678985596, - -2.480539083480835, - -3.7073025703430176, - -2.2757019996643066, - -3.044628143310547, - -2.0491058826446533, - 0.8988063335418701, - 0.9877803325653076, - 1.679555892944336, - ] - super().test_output(expected_slice) - - -class SimpleCrossAttnUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = SimpleCrossAttnUpBlock2D - block_type = "up" - - @property - def dummy_input(self): - return super().get_dummy_input(include_res_hidden_states_tuple=True, include_encoder_hidden_states=True) - - def prepare_init_args_and_inputs_for_common(self): - init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common() - init_dict["cross_attention_dim"] = 32 - return init_dict, inputs_dict - - def test_output(self): - expected_slice = [ - -0.2477731704711914, - -2.644524097442627, - -2.698854684829712, - -0.1323309689760208, - -1.104975700378418, - -0.9408857822418213, - -0.05827316641807556, - -0.3523079752922058, - -0.8070091009140015, - ] - super().test_output(expected_slice) - - -class AttnUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = AttnUpBlock2D - block_type = "up" - - @property - def dummy_input(self): - return super().get_dummy_input(include_res_hidden_states_tuple=True) - - def test_output(self): - expected_slice = [ - -1.8902320861816406, - -1.3337427377700806, - -0.8851560354232788, - 1.4004807472229004, - -0.6870196461677551, - -1.4291317462921143, - 
1.4414796829223633, - 0.6205850839614868, - -0.7466438412666321, - ] - super().test_output(expected_slice) - - -class SkipUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = SkipUpBlock2D - block_type = "up" - - @property - def dummy_input(self): - return super().get_dummy_input(include_res_hidden_states_tuple=True) - - def test_output(self): - expected_slice = [ - -0.987883985042572, - -0.5670157074928284, - -0.6942511796951294, - -1.0125863552093506, - -0.605157732963562, - -0.8832322955131531, - -0.9034348726272583, - -0.7994486689567566, - -0.9313756227493286, - ] - super().test_output(expected_slice) - - -class AttnSkipUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = AttnSkipUpBlock2D - block_type = "up" - - @property - def dummy_input(self): - return super().get_dummy_input(include_res_hidden_states_tuple=True) - - def test_output(self): - expected_slice = [ - 0.5064516067504883, - 0.582533061504364, - 0.7436902523040771, - 0.6235701441764832, - -0.03481818363070488, - -0.1513846069574356, - -0.40579983592033386, - -0.9227585196495056, - -0.9879465699195862, - ] - super().test_output(expected_slice) - - -class UpDecoderBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = UpDecoderBlock2D - block_type = "up" - - @property - def dummy_input(self): - return super().get_dummy_input(include_temb=False) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = {"in_channels": 32, "out_channels": 32} - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_output(self): - expected_slice = [ - -0.14693844318389893, - 0.4114452600479126, - 1.3881545066833496, - 0.6828031539916992, - 0.21913594007492065, - 0.9397234320640564, - 0.8490088582038879, - -0.9372509121894836, - -0.16005855798721313, - ] - super().test_output(expected_slice) - - -class AttnUpDecoderBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): - block_class = AttnUpDecoderBlock2D - block_type = "up" - - @property - def dummy_input(self): - return super().get_dummy_input(include_temb=False) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = {"in_channels": 32, "out_channels": 32} - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_output(self): - expected_slice = [ - -1.6499664783477783, - -2.1455278396606445, - -1.504562497138977, - -2.667104482650757, - -3.483185291290283, - -2.0631113052368164, - 0.9261775612831116, - -0.60399329662323, - -0.1882866621017456, - ] - super().test_output(expected_slice) diff --git a/ppdiffusers/tests/models/test_unet_blocks_common.py b/ppdiffusers/tests/models/test_unet_blocks_common.py deleted file mode 100644 index 4595f43aec64..000000000000 --- a/ppdiffusers/tests/models/test_unet_blocks_common.py +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Tuple - -import paddle - -from ppdiffusers.utils import floats_tensor, paddle_all_close, randn_tensor -from ppdiffusers.utils.testing_utils import require_paddle - - -@require_paddle -class UNetBlockTesterMixin: - @property - def dummy_input(self): - return self.get_dummy_input() - - @property - def output_shape(self): - if self.block_type == "down": - return 4, 32, 16, 16 - elif self.block_type == "mid": - return 4, 32, 32, 32 - elif self.block_type == "up": - return 4, 32, 64, 64 - raise ValueError(f"'{self.block_type}' is not a supported block_type. Set it to 'up', 'mid', or 'down'.") - - def get_dummy_input( - self, - include_temb=True, - include_res_hidden_states_tuple=False, - include_encoder_hidden_states=False, - include_skip_sample=False, - ): - batch_size = 4 - num_channels = 32 - sizes = 32, 32 - generator = paddle.Generator().manual_seed(0) - shape = (batch_size, num_channels) + sizes - hidden_states = randn_tensor(shape, generator=generator) - dummy_input = {"hidden_states": hidden_states} - if include_temb: - temb_channels = 128 - dummy_input["temb"] = randn_tensor((batch_size, temb_channels), generator=generator) - if include_res_hidden_states_tuple: - generator_1 = paddle.Generator().manual_seed(1) - dummy_input["res_hidden_states_tuple"] = (randn_tensor(shape, generator=generator_1),) - if include_encoder_hidden_states: - dummy_input["encoder_hidden_states"] = floats_tensor((batch_size, 32, 32)) - if include_skip_sample: - dummy_input["skip_sample"] = randn_tensor((batch_size, 3) + sizes, generator=generator) - - paddle.seed(0) - return dummy_input - - def prepare_init_args_and_inputs_for_common(self): - init_dict = {"in_channels": 32, "out_channels": 32, "temb_channels": 128} - if self.block_type == "up": - init_dict["prev_output_channel"] = 32 - if self.block_type == "mid": - init_dict.pop("out_channels") - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_output(self, expected_slice): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - unet_block = self.block_class(**init_dict) - unet_block.eval() - with paddle.no_grad(): - output = unet_block(**inputs_dict) - if isinstance(output, Tuple): - output = output[0] - self.assertEqual(list(output.shape), list(self.output_shape)) - output_slice = output[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor(expected_slice) - assert paddle_all_close(output_slice.flatten(), expected_slice, atol=0.005) - - def test_training(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.block_class(**init_dict) - model.train() - for _, v in inputs_dict.items(): - if paddle.is_tensor(v): - v.stop_gradient = False - output = model(**inputs_dict) - if isinstance(output, Tuple): - output = output[0] - noise = randn_tensor(output.shape) - loss = paddle.nn.functional.mse_loss(input=output, label=noise) - loss.backward() diff --git a/ppdiffusers/tests/others/__init__.py b/ppdiffusers/tests/others/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/others/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/others/test_config.py b/ppdiffusers/tests/others/test_config.py deleted file mode 100644 index e4637ce2c35a..000000000000 --- a/ppdiffusers/tests/others/test_config.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import tempfile -import unittest - -from ppdiffusers import ( - DDIMScheduler, - DDPMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - PNDMScheduler, - logging, -) -from ppdiffusers.configuration_utils import ConfigMixin, register_to_config -from ppdiffusers.utils.testing_utils import CaptureLogger - - -class SampleObject(ConfigMixin): - config_name = "config.json" - - @register_to_config - def __init__(self, a=2, b=5, c=(2, 5), d="for diffusion", e=[1, 3]): - pass - - -class SampleObject2(ConfigMixin): - config_name = "config.json" - - @register_to_config - def __init__(self, a=2, b=5, c=(2, 5), d="for diffusion", f=[1, 3]): - pass - - -class SampleObject3(ConfigMixin): - config_name = "config.json" - - @register_to_config - def __init__(self, a=2, b=5, c=(2, 5), d="for diffusion", e=[1, 3], f=[1, 3]): - pass - - -class ConfigTester(unittest.TestCase): - def test_load_not_from_mixin(self): - with self.assertRaises(ValueError): - ConfigMixin.load_config("dummy_path") - - def test_register_to_config(self): - obj = SampleObject() - config = obj.config - assert config["a"] == 2 - assert config["b"] == 5 - assert config["c"] == (2, 5) - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - obj = SampleObject(_name_or_path="lalala") - config = obj.config - assert config["a"] == 2 - assert config["b"] == 5 - assert config["c"] == (2, 5) - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - obj = SampleObject(c=6) - config = obj.config - assert config["a"] == 2 - assert config["b"] == 5 - assert config["c"] == 6 - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - obj = SampleObject(1, c=6) - config = obj.config - assert config["a"] == 1 - assert config["b"] == 5 - assert config["c"] == 6 - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - - def test_save_load(self): - obj = SampleObject() - config = obj.config - assert config["a"] == 2 - assert config["b"] == 5 - assert config["c"] == (2, 5) - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - with tempfile.TemporaryDirectory() as tmpdirname: - obj.save_config(tmpdirname) - new_obj = 
SampleObject.from_config(SampleObject.load_config(tmpdirname)) - new_config = new_obj.config - config = dict(config) - new_config = dict(new_config) - assert config.pop("c") == (2, 5) - assert new_config.pop("c") == [2, 5] - assert config == new_config - - def test_load_ddim_from_pndm(self): - logger = logging.get_logger("ppdiffusers.configuration_utils") - # 30 for warning - logger.setLevel(30) - with CaptureLogger(logger) as cap_logger: - ddim = DDIMScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" - ) - assert ddim.__class__ == DDIMScheduler - assert cap_logger.out == "" - - def test_load_euler_from_pndm(self): - logger = logging.get_logger("ppdiffusers.configuration_utils") - # 30 for warning - logger.setLevel(30) - with CaptureLogger(logger) as cap_logger: - euler = EulerDiscreteScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" - ) - assert euler.__class__ == EulerDiscreteScheduler - assert cap_logger.out == "" - - def test_load_euler_ancestral_from_pndm(self): - logger = logging.get_logger("ppdiffusers.configuration_utils") - # 30 for warning - logger.setLevel(30) - with CaptureLogger(logger) as cap_logger: - euler = EulerAncestralDiscreteScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" - ) - assert euler.__class__ == EulerAncestralDiscreteScheduler - assert cap_logger.out == "" - - def test_load_pndm(self): - logger = logging.get_logger("ppdiffusers.configuration_utils") - # 30 for warning - logger.setLevel(30) - with CaptureLogger(logger) as cap_logger: - pndm = PNDMScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" - ) - assert pndm.__class__ == PNDMScheduler - assert cap_logger.out == "" - - def test_overwrite_config_on_load(self): - logger = logging.get_logger("ppdiffusers.configuration_utils") - # 30 for warning - logger.setLevel(30) - with CaptureLogger(logger) as cap_logger: - ddpm = DDPMScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="scheduler", - prediction_type="sample", - beta_end=8, - ) - with CaptureLogger(logger) as cap_logger_2: - ddpm_2 = DDPMScheduler.from_pretrained("google/ddpm-celebahq-256", beta_start=88) - assert ddpm.__class__ == DDPMScheduler - assert ddpm.config.prediction_type == "sample" - assert ddpm.config.beta_end == 8 - assert ddpm_2.config.beta_start == 88 - assert cap_logger.out == "" - assert cap_logger_2.out == "" - - def test_load_dpmsolver(self): - logger = logging.get_logger("ppdiffusers.configuration_utils") - # 30 for warning - logger.setLevel(30) - with CaptureLogger(logger) as cap_logger: - dpm = DPMSolverMultistepScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" - ) - assert dpm.__class__ == DPMSolverMultistepScheduler - assert cap_logger.out == "" diff --git a/ppdiffusers/tests/others/test_ema.py b/ppdiffusers/tests/others/test_ema.py deleted file mode 100644 index e9eb5150a540..000000000000 --- a/ppdiffusers/tests/others/test_ema.py +++ /dev/null @@ -1,155 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import tempfile -import unittest - -import paddle - -from ppdiffusers import UNet2DConditionModel -from ppdiffusers.training_utils import EMAModel - - -class EMAModelTests(unittest.TestCase): - model_id = "hf-internal-testing/tiny-stable-diffusion-pipe" - batch_size = 1 - prompt_length = 77 - text_encoder_hidden_dim = 32 - num_in_channels = 4 - latent_height = latent_width = 64 - generator = paddle.Generator().manual_seed(0) - - def get_models(self, decay=0.9999): - unet = UNet2DConditionModel.from_pretrained(self.model_id, subfolder="unet") - ema_unet = EMAModel(unet.parameters(), decay=decay, model_cls=UNet2DConditionModel, model_config=unet.config) - return unet, ema_unet - - def get_dummy_inputs(self): - noisy_latents = paddle.randn( - (self.batch_size, self.num_in_channels, self.latent_height, self.latent_width), generator=self.generator - ) - timesteps = paddle.randint(0, 1000, shape=(self.batch_size,), generator=self.generator) - encoder_hidden_states = paddle.randn( - (self.batch_size, self.prompt_length, self.text_encoder_hidden_dim), generator=self.generator - ) - return noisy_latents, timesteps, encoder_hidden_states - - def simulate_backprop(self, unet): - updated_state_dict = {} - for k, param in unet.state_dict().items(): - updated_param = paddle.randn(param.shape, dtype=param.dtype) + ( - param * paddle.randn(param.shape, dtype=param.dtype) - ) - updated_state_dict.update({k: updated_param}) - unet.load_dict(updated_state_dict) - return unet - - def test_optimization_steps_updated(self): - unet, ema_unet = self.get_models() - # Take the first (hypothetical) EMA step. - ema_unet.step(unet.parameters()) - assert ema_unet.optimization_step == 1 - - # Take two more. - for _ in range(2): - ema_unet.step(unet.parameters()) - assert ema_unet.optimization_step == 3 - - def test_shadow_params_not_updated(self): - unet, ema_unet = self.get_models() - # Since the `unet` is not being updated (i.e., backprop'd) - # there won't be any difference between the `params` of `unet` - # and `ema_unet` even if we call `ema_unet.step(unet.parameters())`. - ema_unet.step(unet.parameters()) - orig_params = list(unet.parameters()) - for s_param, param in zip(ema_unet.shadow_params, orig_params): - assert paddle.allclose(s_param, param) - - # The above holds true even if we call `ema.step()` multiple times since - # `unet` params are still not being updated. - for _ in range(4): - ema_unet.step(unet.parameters()) - for s_param, param in zip(ema_unet.shadow_params, orig_params): - assert paddle.allclose(s_param, param) - - def test_shadow_params_updated(self): - unet, ema_unet = self.get_models() - # Here we simulate the parameter updates for `unet`. Since there might - # be some parameters which are initialized to zero we take extra care to - # initialize their values to something non-zero before the multiplication. - unet_pseudo_updated_step_one = self.simulate_backprop(unet) - - # Take the EMA step. - ema_unet.step(unet_pseudo_updated_step_one.parameters()) - - # Now the EMA'd parameters won't be equal to the original model parameters. 
- orig_params = list(unet_pseudo_updated_step_one.parameters()) - for s_param, param in zip(ema_unet.shadow_params, orig_params): - assert not paddle.allclose(s_param, param) - - # Ensure this is the case when we take multiple EMA steps. - for _ in range(4): - ema_unet.step(unet.parameters()) - for s_param, param in zip(ema_unet.shadow_params, orig_params): - assert not paddle.allclose(s_param, param) - - def test_consecutive_shadow_params_updated(self): - # If we call EMA step after a backpropagation consecutively for two times, - # the shadow params from those two steps should be different. - unet, ema_unet = self.get_models() - - # First backprop + EMA - unet_step_one = self.simulate_backprop(unet) - ema_unet.step(unet_step_one.parameters()) - step_one_shadow_params = copy.deepcopy(ema_unet.shadow_params) - - # Second backprop + EMA - unet_step_two = self.simulate_backprop(unet_step_one) - ema_unet.step(unet_step_two.parameters()) - step_two_shadow_params = ema_unet.shadow_params - - for step_one, step_two in zip(step_one_shadow_params, step_two_shadow_params): - assert not paddle.allclose(step_one, step_two) - - def test_zero_decay(self): - # If there's no decay even if there are backprops, EMA steps - # won't take any effect i.e., the shadow params would remain the - # same. - unet, ema_unet = self.get_models(decay=0.0) - unet_step_one = self.simulate_backprop(unet) - ema_unet.step(unet_step_one.parameters()) - step_one_shadow_params = ema_unet.shadow_params - - unet_step_two = self.simulate_backprop(unet_step_one) - ema_unet.step(unet_step_two.parameters()) - step_two_shadow_params = ema_unet.shadow_params - - for step_one, step_two in zip(step_one_shadow_params, step_two_shadow_params): - assert paddle.allclose(step_one, step_two) - - def test_serialization(self): - unet, ema_unet = self.get_models() - noisy_latents, timesteps, encoder_hidden_states = self.get_dummy_inputs() - - with tempfile.TemporaryDirectory() as tmpdir: - ema_unet.save_pretrained(tmpdir) - loaded_unet = UNet2DConditionModel.from_pretrained(tmpdir, model_cls=UNet2DConditionModel) - - # Since no EMA step has been performed the outputs should match. - output = unet(noisy_latents, timesteps, encoder_hidden_states).sample - output_loaded = loaded_unet(noisy_latents, timesteps, encoder_hidden_states).sample - - assert paddle.allclose(output, output_loaded, atol=1e-4) diff --git a/ppdiffusers/tests/others/test_hub_utils.py b/ppdiffusers/tests/others/test_hub_utils.py deleted file mode 100644 index e2c61a2f1e28..000000000000 --- a/ppdiffusers/tests/others/test_hub_utils.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest -from pathlib import Path -from tempfile import TemporaryDirectory -from unittest.mock import Mock, patch - -import ppdiffusers.utils.hub_utils - - -class CreateModelCardTest(unittest.TestCase): - @patch("ppdiffusers.utils.hub_utils.get_full_repo_name") - def test_create_model_card(self, repo_name_mock: Mock) -> None: - repo_name_mock.return_value = "full_repo_name" - with TemporaryDirectory() as tmpdir: - args = Mock() - args.output_dir = tmpdir - args.local_rank = 0 - args.hub_token = "hub_token" - args.dataset_name = "dataset_name" - args.learning_rate = 0.01 - args.train_batch_size = 100000 - args.eval_batch_size = 10000 - args.gradient_accumulation_steps = 0.01 - args.adam_beta1 = 0.02 - args.adam_beta2 = 0.03 - args.adam_weight_decay = 0.0005 - args.adam_epsilon = 1e-06 - args.lr_scheduler = 1 - args.lr_warmup_steps = 10 - args.ema_inv_gamma = 0.001 - args.ema_power = 0.1 - args.ema_max_decay = 0.2 - args.mixed_precision = True - ppdiffusers.utils.hub_utils.create_model_card(args, model_name="model_name") - self.assertTrue((Path(tmpdir) / "README.md").is_file()) diff --git a/ppdiffusers/tests/others/test_image_processor.py b/ppdiffusers/tests/others/test_image_processor.py deleted file mode 100644 index e0c88c40e56b..000000000000 --- a/ppdiffusers/tests/others/test_image_processor.py +++ /dev/null @@ -1,149 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import paddle -import PIL - -from ppdiffusers.image_processor import VaeImageProcessor - - -class ImageProcessorTest(unittest.TestCase): - @property - def dummy_sample(self): - batch_size = 1 - num_channels = 3 - height = 8 - width = 8 - - sample = paddle.rand((batch_size, num_channels, height, width)) - - return sample - - def to_np(self, image): - if isinstance(image[0], PIL.Image.Image): - return np.stack([np.array(i) for i in image], axis=0) - elif isinstance(image, paddle.Tensor): - return image.transpose([0, 2, 3, 1]).cpu().numpy() - return image - - def test_vae_image_processor_pd(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) - - input_pd = self.dummy_sample - input_np = self.to_np(input_pd) - - for output_type in ["pd", "np", "pil"]: - out = image_processor.postprocess( - image_processor.preprocess(input_pd), - output_type=output_type, - ) - out_np = self.to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - assert ( - np.abs(in_np - out_np).max() < 1e-6 - ), f"decoded output does not match input for output_type {output_type}" - - def test_vae_image_processor_np(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) - input_np = self.dummy_sample.transpose([0, 2, 3, 1]).cpu().numpy() - - for output_type in ["pd", "np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type) - - out_np = self.to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - assert ( - np.abs(in_np - out_np).max() < 1e-6 - ), f"decoded output does not match input for output_type {output_type}" - - def test_vae_image_processor_pil(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) - - input_np = self.dummy_sample.transpose([0, 2, 3, 1]).cpu().numpy() - input_pil = image_processor.numpy_to_pil(input_np) - - for output_type in ["pd", "np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type) - for i, o in zip(input_pil, out): - in_np = np.array(i) - out_np = self.to_np(out) if output_type == "pil" else (self.to_np(out) * 255).round() - assert ( - np.abs(in_np - out_np).max() < 1e-6 - ), f"decoded output does not match input for output_type {output_type}" - - def test_preprocess_input_3d(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) - - input_pd_4d = self.dummy_sample - input_pd_3d = input_pd_4d.squeeze(0) - - out_pt_4d = image_processor.postprocess( - image_processor.preprocess(input_pd_4d), - output_type="np", - ) - out_pt_3d = image_processor.postprocess( - image_processor.preprocess(input_pd_3d), - output_type="np", - ) - - input_np_4d = self.to_np(self.dummy_sample) - input_np_3d = input_np_4d.squeeze(0) - - out_np_4d = image_processor.postprocess( - image_processor.preprocess(input_np_4d), - output_type="np", - ) - out_np_3d = image_processor.postprocess( - image_processor.preprocess(input_np_3d), - output_type="np", - ) - - assert np.abs(out_pt_4d - out_pt_3d).max() < 1e-6 - assert np.abs(out_np_4d - out_np_3d).max() < 1e-6 - - def test_preprocess_input_list(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) - - input_pd_4d = self.dummy_sample - input_pd_list = list(input_pd_4d) - - out_pt_4d = image_processor.postprocess( - image_processor.preprocess(input_pd_4d), - output_type="np", - ) - - out_pt_list = image_processor.postprocess( - 
image_processor.preprocess(input_pd_list), - output_type="np", - ) - - input_np_4d = self.to_np(self.dummy_sample) - list(input_np_4d) - - out_np_4d = image_processor.postprocess( - image_processor.preprocess(input_pd_4d), - output_type="np", - ) - - out_np_list = image_processor.postprocess( - image_processor.preprocess(input_pd_list), - output_type="np", - ) - - assert np.abs(out_pt_4d - out_pt_list).max() < 1e-6 - assert np.abs(out_np_4d - out_np_list).max() < 1e-6 diff --git a/ppdiffusers/tests/others/test_outputs.py b/ppdiffusers/tests/others/test_outputs.py deleted file mode 100644 index fc811a0e1c40..000000000000 --- a/ppdiffusers/tests/others/test_outputs.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -from dataclasses import dataclass -from typing import List, Union - -import numpy as np -import PIL.Image - -from ppdiffusers.utils.outputs import BaseOutput - - -@dataclass -class CustomOutput(BaseOutput): - images: Union[List[PIL.Image.Image], np.ndarray] - - -class ConfigTester(unittest.TestCase): - def test_outputs_single_attribute(self): - outputs = CustomOutput(images=np.random.rand(1, 3, 4, 4)) - assert isinstance(outputs.images, np.ndarray) - assert outputs.images.shape == (1, 3, 4, 4) - assert isinstance(outputs["images"], np.ndarray) - assert outputs["images"].shape == (1, 3, 4, 4) - assert isinstance(outputs[0], np.ndarray) - assert outputs[0].shape == (1, 3, 4, 4) - outputs = CustomOutput(images=[PIL.Image.new("RGB", (4, 4))]) - assert isinstance(outputs.images, list) - assert isinstance(outputs.images[0], PIL.Image.Image) - assert isinstance(outputs["images"], list) - assert isinstance(outputs["images"][0], PIL.Image.Image) - assert isinstance(outputs[0], list) - assert isinstance(outputs[0][0], PIL.Image.Image) - - def test_outputs_dict_init(self): - outputs = CustomOutput({"images": np.random.rand(1, 3, 4, 4)}) - assert isinstance(outputs.images, np.ndarray) - assert outputs.images.shape == (1, 3, 4, 4) - assert isinstance(outputs["images"], np.ndarray) - assert outputs["images"].shape == (1, 3, 4, 4) - assert isinstance(outputs[0], np.ndarray) - assert outputs[0].shape == (1, 3, 4, 4) - outputs = CustomOutput({"images": [PIL.Image.new("RGB", (4, 4))]}) - assert isinstance(outputs.images, list) - assert isinstance(outputs.images[0], PIL.Image.Image) - assert isinstance(outputs["images"], list) - assert isinstance(outputs["images"][0], PIL.Image.Image) - assert isinstance(outputs[0], list) - assert isinstance(outputs[0][0], PIL.Image.Image) diff --git a/ppdiffusers/tests/others/test_training.py b/ppdiffusers/tests/others/test_training.py deleted file mode 100644 index 2abe14bd6c36..000000000000 --- a/ppdiffusers/tests/others/test_training.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import paddle - -from ppdiffusers import DDIMScheduler, DDPMScheduler, UNet2DConditionModel, UNet2DModel -from ppdiffusers.training_utils import set_seed -from ppdiffusers.utils.import_utils import is_ppxformers_available -from ppdiffusers.utils.testing_utils import slow - - -class UNet2DModelTrainingTests(unittest.TestCase): - def get_model_optimizer(self, resolution=32): - set_seed(0) - model = UNet2DModel(sample_size=resolution, in_channels=3, out_channels=3) - optimizer = paddle.optimizer.SGD(parameters=model.parameters(), learning_rate=0.0001) - return model, optimizer - - @slow - def test_training_step_equality(self): - ddpm_scheduler = DDPMScheduler( - num_train_timesteps=1000, beta_start=0.0001, beta_end=0.02, beta_schedule="linear", clip_sample=True - ) - ddim_scheduler = DDIMScheduler( - num_train_timesteps=1000, beta_start=0.0001, beta_end=0.02, beta_schedule="linear", clip_sample=True - ) - assert ddpm_scheduler.config.num_train_timesteps == ddim_scheduler.config.num_train_timesteps - set_seed(0) - clean_images = [paddle.randn(shape=(4, 3, 32, 32)).clip(min=-1, max=1) for _ in range(4)] - noise = [paddle.randn(shape=(4, 3, 32, 32)) for _ in range(4)] - timesteps = [paddle.randint(0, 1000, (4,)).astype(dtype="int64") for _ in range(4)] - model, optimizer = self.get_model_optimizer(resolution=32) - model.train() - for i in range(4): - optimizer.clear_grad() - ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], noise[i], timesteps[i]) - ddpm_noise_pred = model(ddpm_noisy_images, timesteps[i]).sample - loss = paddle.nn.functional.mse_loss(input=ddpm_noise_pred, label=noise[i]) - loss.backward() - optimizer.step() - del model, optimizer - model, optimizer = self.get_model_optimizer(resolution=32) - model.train() - for i in range(4): - optimizer.clear_grad() - ddim_noisy_images = ddim_scheduler.add_noise(clean_images[i], noise[i], timesteps[i]) - ddim_noise_pred = model(ddim_noisy_images, timesteps[i]).sample - loss = paddle.nn.functional.mse_loss(input=ddim_noise_pred, label=noise[i]) - loss.backward() - optimizer.step() - del model, optimizer - self.assertTrue(paddle.allclose(ddpm_noisy_images, ddim_noisy_images, atol=1e-05)) - self.assertTrue(paddle.allclose(ddpm_noise_pred, ddim_noise_pred, atol=1e-04)) - - -# new added -class UNet2DConditionModelTrainingTests(unittest.TestCase): - def get_model_optimizer(self, resolution=32): - set_seed(0) - model = UNet2DConditionModel(sample_size=resolution, in_channels=3, out_channels=3) - optimizer = paddle.optimizer.AdamW(parameters=model.parameters(), learning_rate=0.0001) - return model, optimizer - - @slow - def test_training_step_equality(self): - ddpm_scheduler = DDPMScheduler( - num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" - ) - ddim_scheduler = DDIMScheduler( - num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" - ) - assert ddpm_scheduler.config.num_train_timesteps == 
ddim_scheduler.config.num_train_timesteps - set_seed(0) - clean_images = [paddle.randn(shape=(4, 3, 32, 32)).clip(min=-1, max=1) for _ in range(4)] - noise = [paddle.randn(shape=(4, 3, 32, 32)) for _ in range(4)] - text_embeddings = [paddle.randn(shape=(4, 77, 1280)) for _ in range(4)] - timesteps = [paddle.randint(0, 1000, (4,)).astype(dtype="int64") for _ in range(4)] - model, optimizer = self.get_model_optimizer(resolution=32) - model.train() - for i in range(4): - optimizer.clear_grad() - ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], noise[i], timesteps[i]) - ddpm_noise_pred = model(ddpm_noisy_images, timesteps[i], encoder_hidden_states=text_embeddings[i]).sample - loss = paddle.nn.functional.mse_loss(input=ddpm_noise_pred, label=noise[i]) - loss.backward() - optimizer.step() - del model, optimizer - model, optimizer = self.get_model_optimizer(resolution=32) - model.train() - for i in range(4): - optimizer.clear_grad() - ddim_noisy_images = ddim_scheduler.add_noise(clean_images[i], noise[i], timesteps[i]) - ddim_noise_pred = model(ddim_noisy_images, timesteps[i], encoder_hidden_states=text_embeddings[i]).sample - loss = paddle.nn.functional.mse_loss(input=ddim_noise_pred, label=noise[i]) - loss.backward() - optimizer.step() - del model, optimizer - self.assertTrue(paddle.allclose(ddpm_noisy_images, ddim_noisy_images, atol=1e-05)) - self.assertTrue(paddle.allclose(ddpm_noise_pred, ddim_noise_pred, atol=1e-04)) - - @unittest.skipIf( - not is_ppxformers_available(), - reason="scaled_dot_product_attention attention is only available with CUDA and `scaled_dot_product_attention` installed", - ) - @slow - def test_recompute_xformers_training(self): - ddpm_scheduler = DDPMScheduler( - num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" - ) - set_seed(0) - clean_images = [paddle.randn(shape=(4, 3, 32, 32)).clip(min=-1, max=1) for _ in range(4)] - noise = [paddle.randn(shape=(4, 3, 32, 32)) for _ in range(4)] - timesteps = [paddle.randint(0, 1000, (4,)).astype(dtype="int64") for _ in range(4)] - text_embeddings = [paddle.randn(shape=(4, 77, 1280)) for _ in range(4)] - model, optimizer = self.get_model_optimizer(resolution=32) - model.enable_gradient_checkpointing() - model.enable_xformers_memory_efficient_attention() - model.train() - for i in range(4): - optimizer.clear_grad() - ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], noise[i], timesteps[i]) - ddpm_noise_pred = model(ddpm_noisy_images, timesteps[i], encoder_hidden_states=text_embeddings[i]).sample - loss = paddle.nn.functional.mse_loss(input=ddpm_noise_pred, label=noise[i]) - loss.backward() - optimizer.step() diff --git a/ppdiffusers/tests/others/test_utils.py b/ppdiffusers/tests/others/test_utils.py deleted file mode 100644 index 4fb73a2ba430..000000000000 --- a/ppdiffusers/tests/others/test_utils.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -from ppdiffusers import __version__ -from ppdiffusers.utils import deprecate - - -class DeprecateTester(unittest.TestCase): - higher_version = ".".join([str(int(__version__.split(".")[0]) + 1)] + __version__.split(".")[1:]) - lower_version = "0.0.1" - - def test_deprecate_function_arg(self): - kwargs = {"deprecated_arg": 4} - with self.assertWarns(FutureWarning) as warning: - output = deprecate("deprecated_arg", self.higher_version, "message", take_from=kwargs) - assert output == 4 - assert ( - str(warning.warning) - == f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}. message" - ) - - def test_deprecate_function_arg_tuple(self): - kwargs = {"deprecated_arg": 4} - with self.assertWarns(FutureWarning) as warning: - output = deprecate(("deprecated_arg", self.higher_version, "message"), take_from=kwargs) - assert output == 4 - assert ( - str(warning.warning) - == f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}. message" - ) - - def test_deprecate_function_args(self): - kwargs = {"deprecated_arg_1": 4, "deprecated_arg_2": 8} - with self.assertWarns(FutureWarning) as warning: - output_1, output_2 = deprecate( - ("deprecated_arg_1", self.higher_version, "Hey"), - ("deprecated_arg_2", self.higher_version, "Hey"), - take_from=kwargs, - ) - assert output_1 == 4 - assert output_2 == 8 - assert ( - str(warning.warnings[0].message) - == f"The `deprecated_arg_1` argument is deprecated and will be removed in version {self.higher_version}. Hey" - ) - assert ( - str(warning.warnings[1].message) - == f"The `deprecated_arg_2` argument is deprecated and will be removed in version {self.higher_version}. Hey" - ) - - def test_deprecate_function_incorrect_arg(self): - kwargs = {"deprecated_arg": 4} - with self.assertRaises(TypeError) as error: - deprecate(("wrong_arg", self.higher_version, "message"), take_from=kwargs) - assert "test_deprecate_function_incorrect_arg in" in str(error.exception) - assert "line" in str(error.exception) - assert "got an unexpected keyword argument `deprecated_arg`" in str(error.exception) - - def test_deprecate_arg_no_kwarg(self): - with self.assertWarns(FutureWarning) as warning: - deprecate(("deprecated_arg", self.higher_version, "message")) - assert ( - str(warning.warning) - == f"`deprecated_arg` is deprecated and will be removed in version {self.higher_version}. message" - ) - - def test_deprecate_args_no_kwarg(self): - with self.assertWarns(FutureWarning) as warning: - deprecate( - ("deprecated_arg_1", self.higher_version, "Hey"), ("deprecated_arg_2", self.higher_version, "Hey") - ) - assert ( - str(warning.warnings[0].message) - == f"`deprecated_arg_1` is deprecated and will be removed in version {self.higher_version}. Hey" - ) - assert ( - str(warning.warnings[1].message) - == f"`deprecated_arg_2` is deprecated and will be removed in version {self.higher_version}. Hey" - ) - - def test_deprecate_class_obj(self): - class Args: - arg = 5 - - with self.assertWarns(FutureWarning) as warning: - arg = deprecate(("arg", self.higher_version, "message"), take_from=Args()) - assert arg == 5 - assert ( - str(warning.warning) - == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. 
message" - ) - - def test_deprecate_class_objs(self): - class Args: - arg = 5 - foo = 7 - - with self.assertWarns(FutureWarning) as warning: - arg_1, arg_2 = deprecate( - ("arg", self.higher_version, "message"), - ("foo", self.higher_version, "message"), - ("does not exist", self.higher_version, "message"), - take_from=Args(), - ) - assert arg_1 == 5 - assert arg_2 == 7 - assert ( - str(warning.warning) - == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message" - ) - assert ( - str(warning.warnings[0].message) - == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message" - ) - assert ( - str(warning.warnings[1].message) - == f"The `foo` attribute is deprecated and will be removed in version {self.higher_version}. message" - ) - - def test_deprecate_incorrect_version(self): - kwargs = {"deprecated_arg": 4} - with self.assertRaises(ValueError) as error: - deprecate(("wrong_arg", self.lower_version, "message"), take_from=kwargs) - assert ( - str(error.exception) - == f"The deprecation tuple ('wrong_arg', '0.0.1', 'message') should be removed since ppdiffusers' version {__version__} is >= {self.lower_version}" - ) - - def test_deprecate_incorrect_no_standard_warn(self): - with self.assertWarns(FutureWarning) as warning: - deprecate(("deprecated_arg", self.higher_version, "This message is better!!!"), standard_warn=False) - assert str(warning.warning) == "This message is better!!!" - - def test_deprecate_stacklevel(self): - with self.assertWarns(FutureWarning) as warning: - deprecate(("deprecated_arg", self.higher_version, "This message is better!!!"), standard_warn=False) - assert str(warning.warning) == "This message is better!!!" - assert "test_utils.py" in warning.filename diff --git a/ppdiffusers/tests/pipelines/__init__.py b/ppdiffusers/tests/pipelines/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/altdiffusion/__init__.py b/ppdiffusers/tests/pipelines/altdiffusion/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/altdiffusion/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py b/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py deleted file mode 100644 index 92cee5f2189f..000000000000 --- a/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py +++ /dev/null @@ -1,224 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import paddle - -import ppdiffusers # noqa F401 -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, XLMRobertaTokenizer -from ppdiffusers import ( - AltDiffusionPipeline, - AutoencoderKL, - DDIMScheduler, - PNDMScheduler, - UNet2DConditionModel, -) -from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesConfig, - RobertaSeriesModelWithTransformation, -) -from ppdiffusers.utils import slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - - -class AltDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = AltDiffusionPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - paddle.seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - paddle.seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - projection_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=5002, - ) - text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = XLMRobertaTokenizer.from_pretrained( - "hf-internal-testing/tiny-xlm-roberta", model_max_length=77 - ) # must set model_max_length 77 here - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel 
eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_alt_diffusion_ddim(self): - components = self.get_dummy_components() - paddle.seed(0) - text_encoder_config = RobertaSeriesConfig( - hidden_size=32, - project_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - vocab_size=5002, - ) - text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config).eval() - components["text_encoder"] = text_encoder - alt_pipe = AltDiffusionPipeline(**components) - alt_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - inputs["prompt"] = "A photo of an astronaut" - output = alt_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.32336113, 0.2371237, 0.34009337, 0.22972241, 0.23742735, 0.4925817, 0.22020563, 0.20505491, 0.43374813] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 - - def test_alt_diffusion_pndm(self): - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - paddle.seed(0) - text_encoder_config = RobertaSeriesConfig( - hidden_size=32, - project_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - vocab_size=5002, - ) - text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config).eval() - components["text_encoder"] = text_encoder - alt_pipe = AltDiffusionPipeline(**components) - alt_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - output = alt_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.24095133, 0.26875997, 0.34291863, 0.2529385, 0.2736602, 0.49928105, 0.23973131, 0.21133915, 0.41810605] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 - - -@slow -@require_paddle_gpu -class AltDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_alt_diffusion(self): - alt_pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", safety_checker=None) - alt_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - generator = paddle.Generator().manual_seed(0) - output = alt_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=20, output_type="np") - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [ - 0.8718514442443848, - 0.8715569972991943, - 0.8748429417610168, - 0.8708409070968628, - 0.8782679438591003, - 0.8931069374084473, - 0.883078932762146, - 0.881088376045227, - 0.8617547154426575, - ] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_alt_diffusion_fast_ddim(self): - scheduler = DDIMScheduler.from_pretrained("BAAI/AltDiffusion", subfolder="scheduler") - alt_pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", scheduler=scheduler, safety_checker=None) - alt_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - generator = paddle.Generator().manual_seed(0) - output = alt_pipe([prompt], generator=generator, num_inference_steps=2, output_type="numpy") - image = 
output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [ - 0.9265012741088867, - 0.9305188059806824, - 0.8999797105789185, - 0.9346827268600464, - 0.9264709949493408, - 0.9447494745254517, - 0.9428927898406982, - 0.9417785406112671, - 0.9157286882400513, - ] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 diff --git a/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py b/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py deleted file mode 100644 index 6174438e55e0..000000000000 --- a/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py +++ /dev/null @@ -1,255 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import paddle - -import ppdiffusers # noqa F401 -from paddlenlp.transformers import XLMRobertaTokenizer -from ppdiffusers import ( - AltDiffusionImg2ImgPipeline, - AutoencoderKL, - PNDMScheduler, - UNet2DConditionModel, -) -from ppdiffusers.image_processor import VaeImageProcessor -from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesConfig, - RobertaSeriesModelWithTransformation, -) -from ppdiffusers.utils import floats_tensor, load_image, slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - - -class AltDiffusionImg2ImgPipelineFastTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = 32, 32 - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) - return image - - @property - def dummy_cond_unet(self): - paddle.seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_vae(self): - paddle.seed(0) - model = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return model - - @property - def dummy_text_encoder(self): - paddle.seed(0) - config = RobertaSeriesConfig( - hidden_size=32, - project_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=5006, - ) - return RobertaSeriesModelWithTransformation(config) - - @property - def dummy_extractor(self): - def extract(*args, **kwargs): - class Out: - def __init__(self): - self.pixel_values = paddle.ones(shape=[0]) - - def to(self, device): - 
self.pixel_values - return self - - return Out() - - return extract - - def test_stable_diffusion_img2img_default_case(self): - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta") - tokenizer.model_max_length = 77 - init_image = self.dummy_image - alt_pipe = AltDiffusionImg2ImgPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor) - alt_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - generator = paddle.Generator().manual_seed(0) - output = alt_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - ) - image = output.images - generator = paddle.Generator().manual_seed(0) - image_from_tuple = alt_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - return_dict=False, - )[0] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [0.48931587, 0.40102208, 0.49653798, 0.4203022, 0.34621224, 0.50789315, 0.41116416, 0.4933398, 0.5465742] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.005 - - def test_stable_diffusion_img2img_fp16(self): - """Test that stable diffusion img2img works with fp16""" - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta") - tokenizer.model_max_length = 77 - init_image = self.dummy_image - unet = unet.to(dtype=paddle.float16) - vae = vae.to(dtype=paddle.float16) - bert = bert.to(dtype=paddle.float16) - alt_pipe = AltDiffusionImg2ImgPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor) - alt_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - generator = paddle.Generator().manual_seed(0) - image = alt_pipe( - [prompt], generator=generator, num_inference_steps=2, output_type="np", image=init_image - ).images - assert image.shape == (1, 32, 32, 3) - - def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/sketch-mountains-input.jpg" - ) - init_image = init_image.resize((760, 504)) - model_id = "BAAI/AltDiffusion" - pipe = AltDiffusionImg2ImgPipeline.from_pretrained(model_id, safety_checker=None) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - prompt = "A fantasy landscape, trending on artstation" - generator = paddle.Generator().manual_seed(0) - output = pipe( - prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5, generator=generator, output_type="np" - ) - image = output.images[0] - image_slice = 
image[255:258, 383:386, -1] - assert image.shape == (504, 760, 3) - expected_slice = np.array( - [0.3251649, 0.3340174, 0.3418343, 0.32628638, 0.33462793, 0.3300547, 0.31628466, 0.3470268, 0.34273332] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005 - - -@slow -@require_paddle_gpu -class AltDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_stable_diffusion_img2img_pipeline_default(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/sketch-mountains-input.jpg" - ) - init_image = init_image.resize((768, 512)) - # expected_image = load_numpy( - # "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/fantasy_landscape_alt.npy" - # ) - model_id = "BAAI/AltDiffusion" - pipe = AltDiffusionImg2ImgPipeline.from_pretrained(model_id, safety_checker=None) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - prompt = "A fantasy landscape, trending on artstation" - generator = paddle.Generator().manual_seed(0) - output = pipe( - prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5, generator=generator, output_type="np" - ) - image = output.images - assert image.shape == (1, 512, 768, 3) - image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array( - [ - 0.09987255930900574, - 0.09875822067260742, - 0.12803134322166443, - 0.10067081451416016, - 0.1142435073852539, - 0.11815103888511658, - 0.14216548204421997, - 0.16465380787849426, - 0.15393462777137756, - ] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/audio_diffusion/__init__.py b/ppdiffusers/tests/pipelines/audio_diffusion/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/audio_diffusion/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py b/ppdiffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py deleted file mode 100644 index ae028faf9cdc..000000000000 --- a/ppdiffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import paddle - -from ppdiffusers import ( - AudioDiffusionPipeline, - AutoencoderKL, - DDIMScheduler, - DDPMScheduler, - DiffusionPipeline, - Mel, - UNet2DConditionModel, - UNet2DModel, -) -from ppdiffusers.utils import slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - - -class PipelineFastTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - @property - def dummy_unet(self): - paddle.seed(0) - model = UNet2DModel( - sample_size=(32, 64), - in_channels=1, - out_channels=1, - layers_per_block=2, - block_out_channels=(128, 128), - down_block_types=("AttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "AttnUpBlock2D"), - ) - return model - - @property - def dummy_unet_condition(self): - paddle.seed(0) - model = UNet2DConditionModel( - sample_size=(64, 32), - in_channels=1, - out_channels=1, - layers_per_block=2, - block_out_channels=(128, 128), - down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"), - cross_attention_dim=10, - ) - return model - - @property - def dummy_vqvae_and_unet(self): - paddle.seed(0) - vqvae = AutoencoderKL( - sample_size=(128, 64), - in_channels=1, - out_channels=1, - latent_channels=1, - layers_per_block=2, - block_out_channels=(128, 128), - down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"), - up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"), - ) - unet = UNet2DModel( - sample_size=(64, 32), - in_channels=1, - out_channels=1, - layers_per_block=2, - block_out_channels=(128, 128), - down_block_types=("AttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "AttnUpBlock2D"), - ) - return vqvae, unet - - def test_audio_diffusion(self): - mel = Mel() - scheduler = DDPMScheduler() - pipe = AudioDiffusionPipeline(vqvae=None, unet=self.dummy_unet, mel=mel, scheduler=scheduler) - pipe.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(42) - output = pipe(generator=generator, steps=4) - audio = output.audios[0] - image = output.images[0] - generator = paddle.Generator().manual_seed(42) - output = pipe(generator=generator, steps=4, return_dict=False) - image_from_tuple = output[0][0] - assert audio.shape == (1, (self.dummy_unet.config.sample_size[1] - 1) * mel.hop_length) - assert ( - image.height == self.dummy_unet.config.sample_size[0] - and image.width == self.dummy_unet.config.sample_size[1] - ) - image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - image_from_tuple_slice = np.frombuffer(image_from_tuple.tobytes(), dtype="uint8")[:10] - expected_slice = np.array([0, 252, 0, 160, 144, 1, 0, 211, 99, 3]) - assert np.abs(image_slice.flatten() - expected_slice).max() == 0 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() <= 5 - scheduler = DDIMScheduler() - dummy_vqvae_and_unet = self.dummy_vqvae_and_unet - pipe = AudioDiffusionPipeline( - vqvae=self.dummy_vqvae_and_unet[0], unet=dummy_vqvae_and_unet[1], mel=mel, scheduler=scheduler - ) - 
pipe.set_progress_bar_config(disable=None) - np.random.seed(0) - raw_audio = np.random.uniform(-1, 1, ((dummy_vqvae_and_unet[0].config.sample_size[1] - 1) * mel.hop_length,)) - generator = paddle.Generator().manual_seed(42) - output = pipe(raw_audio=raw_audio, generator=generator, start_step=5, steps=10) - image = output.images[0] - assert ( - image.height == self.dummy_vqvae_and_unet[0].config.sample_size[0] - and image.width == self.dummy_vqvae_and_unet[0].config.sample_size[1] - ) - image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - expected_slice = np.array([128, 100, 153, 95, 92, 77, 130, 121, 81, 166]) - assert np.abs(image_slice.flatten() - expected_slice).max() <= 5 - dummy_unet_condition = self.dummy_unet_condition - pipe = AudioDiffusionPipeline( - vqvae=self.dummy_vqvae_and_unet[0], unet=dummy_unet_condition, mel=mel, scheduler=scheduler - ) - np.random.seed(0) - encoding = paddle.rand(shape=(1, 1, 10)) - output = pipe(generator=generator, encoding=encoding) - image = output.images[0] - image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - expected_slice = np.array([139, 103, 88, 105, 100, 120, 116, 99, 106, 89]) - assert np.abs(image_slice.flatten() - expected_slice).max() <= 5 - - -@slow -@require_paddle_gpu -class PipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_audio_diffusion(self): - pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256") - pipe.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(42) - output = pipe(generator=generator) - audio = output.audios[0] - image = output.images[0] - assert audio.shape == (1, (pipe.unet.config.sample_size[1] - 1) * pipe.mel.hop_length) - assert image.height == pipe.unet.config.sample_size[0] and image.width == pipe.unet.config.sample_size[1] - image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - expected_slice = np.array([151, 167, 154, 144, 122, 134, 121, 105, 70, 26]) - assert np.abs(image_slice.flatten() - expected_slice).max() <= 5 diff --git a/ppdiffusers/tests/pipelines/audioldm/__init__.py b/ppdiffusers/tests/pipelines/audioldm/__init__.py deleted file mode 100644 index 595add0aed9e..000000000000 --- a/ppdiffusers/tests/pipelines/audioldm/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/audioldm/test_audioldm.py b/ppdiffusers/tests/pipelines/audioldm/test_audioldm.py deleted file mode 100644 index 0155ef1fb163..000000000000 --- a/ppdiffusers/tests/pipelines/audioldm/test_audioldm.py +++ /dev/null @@ -1,399 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import paddle -import paddle.nn.functional as F - -from paddlenlp.transformers import ( - ClapTextConfig, - ClapTextModelWithProjection, - RobertaTokenizer, - SpeechT5HifiGan, - SpeechT5HifiGanConfig, -) -from ppdiffusers import ( - AudioLDMPipeline, - AutoencoderKL, - DDIMScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - UNet2DConditionModel, -) -from ppdiffusers.training_utils import enable_full_determinism -from ppdiffusers.utils import require_paddle_gpu, slow - -from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - -enable_full_determinism(42) - - -class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = AudioLDMPipeline - params = TEXT_TO_AUDIO_PARAMS - batch_params = TEXT_TO_AUDIO_BATCH_PARAMS - test_xformers_attention = False - required_optional_params = frozenset( - [ - "num_inference_steps", - "num_waveforms_per_prompt", - "generator", - "latents", - "output_type", - "return_dict", - "callback", - "callback_steps", - ] - ) - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=(32, 64), - class_embed_type="simple_projection", - projection_class_embeddings_input_dim=32, - class_embeddings_concat=True, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - paddle.seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=1, - out_channels=1, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - paddle.seed(0) - text_encoder_config = ClapTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - projection_dim=32, - ) - text_encoder = ClapTextModelWithProjection(text_encoder_config) - text_encoder.eval() - tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77) - - vocoder_config = SpeechT5HifiGanConfig( - model_in_dim=8, - sampling_rate=16000, - upsample_initial_channel=16, - upsample_rates=[2, 2], - upsample_kernel_sizes=[4, 4], - resblock_kernel_sizes=[3, 7], - resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]], - normalize_before=False, - ) - - vocoder = SpeechT5HifiGan(vocoder_config) - vocoder.eval() - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "vocoder": vocoder, - } - return components - - def get_dummy_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed) - inputs = { - "prompt": "A hammer hitting a wooden surface", - 
"generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - } - return inputs - - def test_audioldm_ddim(self): - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - inputs = self.get_dummy_inputs() - output = audioldm_pipe(**inputs) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) == 256 - - audio_slice = audio[:10] - expected_slice = np.array( - [-0.0050, 0.0050, -0.0060, 0.0033, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0033] - ) - - assert np.abs(audio_slice - expected_slice).max() < 1e-2 - - def test_audioldm_prompt_embeds(self): - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - # audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - inputs["prompt"] = 3 * [inputs["prompt"]] - - # forward - output = audioldm_pipe(**inputs) - audio_1 = output.audios[0] - - inputs = self.get_dummy_inputs() - prompt = 3 * [inputs.pop("prompt")] - - text_inputs = audioldm_pipe.tokenizer( - prompt, - padding="max_length", - max_length=audioldm_pipe.tokenizer.model_max_length, - return_attention_mask=True, - truncation=True, - return_tensors="pd", - ) - text_inputs = text_inputs["input_ids"].cast("int32") - - prompt_embeds = audioldm_pipe.text_encoder( - text_inputs, - ) - prompt_embeds = prompt_embeds.text_embeds - # additional L_2 normalization over each hidden-state - prompt_embeds = F.normalize(prompt_embeds, axis=-1) - - inputs["prompt_embeds"] = prompt_embeds - - # forward - output = audioldm_pipe(**inputs) - audio_2 = output.audios[0] - - assert np.abs(audio_1 - audio_2).max() < 1e-2 - - def test_audioldm_negative_prompt_embeds(self): - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - negative_prompt = 3 * ["this is a negative prompt"] - inputs["negative_prompt"] = negative_prompt - inputs["prompt"] = 3 * [inputs["prompt"]] - - # forward - output = audioldm_pipe(**inputs) - audio_1 = output.audios[0] - - inputs = self.get_dummy_inputs() - prompt = 3 * [inputs.pop("prompt")] - - embeds = [] - for p in [prompt, negative_prompt]: - text_inputs = audioldm_pipe.tokenizer( - p, - padding="max_length", - max_length=audioldm_pipe.tokenizer.model_max_length, - truncation=True, - return_attention_mask=True, - return_tensors="pd", - ) - text_inputs = text_inputs["input_ids"].cast("int32") - - text_embeds = audioldm_pipe.text_encoder( - text_inputs, - ) - text_embeds = text_embeds.text_embeds - # additional L_2 normalization over each hidden-state - text_embeds = F.normalize(text_embeds, axis=-1) - - embeds.append(text_embeds) - - inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds - - # forward - output = audioldm_pipe(**inputs) - audio_2 = output.audios[0] - - assert np.abs(audio_1 - audio_2).max() < 1e-2 - - def test_audioldm_negative_prompt(self): - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - negative_prompt = "egg cracking" - output = audioldm_pipe(**inputs, negative_prompt=negative_prompt) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) == 256 - - audio_slice = audio[:10] - expected_slice = np.array( - [-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, 
-0.0028, 0.0032] - ) - - assert np.abs(audio_slice - expected_slice).max() < 1e-2 - - def test_audioldm_num_waveforms_per_prompt(self): - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - audioldm_pipe = AudioLDMPipeline(**components) - - prompt = "A hammer hitting a wooden surface" - - # test num_waveforms_per_prompt=1 (default) - audios = audioldm_pipe(prompt, num_inference_steps=2).audios - - assert audios.shape == (1, 256) - - # test num_waveforms_per_prompt=1 (default) for batch of prompts - batch_size = 2 - audios = audioldm_pipe([prompt] * batch_size, num_inference_steps=2).audios - - assert audios.shape == (batch_size, 256) - - # test num_waveforms_per_prompt for single prompt - num_waveforms_per_prompt = 2 - audios = audioldm_pipe(prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt).audios - - assert audios.shape == (num_waveforms_per_prompt, 256) - - # test num_waveforms_per_prompt for batch of prompts - batch_size = 2 - audios = audioldm_pipe( - [prompt] * batch_size, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt - ).audios - - assert audios.shape == (batch_size * num_waveforms_per_prompt, 256) - - def test_audioldm_audio_length_in_s(self): - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - vocoder_sampling_rate = audioldm_pipe.vocoder.config.sampling_rate - - inputs = self.get_dummy_inputs() - output = audioldm_pipe(audio_length_in_s=0.016, **inputs) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) / vocoder_sampling_rate == 0.016 - - output = audioldm_pipe(audio_length_in_s=0.032, **inputs) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) / vocoder_sampling_rate == 0.032 - - def test_audioldm_vocoder_model_in_dim(self): - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - # audioldm_pipe.set_progress_bar_config(disable=None) - - prompt = ["hey"] - - output = audioldm_pipe(prompt, num_inference_steps=1) - audio_shape = output.audios.shape - assert audio_shape == (1, 256) - - config = audioldm_pipe.vocoder.config - config.model_in_dim *= 2 - audioldm_pipe.vocoder = SpeechT5HifiGan(config) - output = audioldm_pipe(prompt, num_inference_steps=1) - audio_shape = output.audios.shape - # waveform shape is unchanged, we just have 2x the number of mel channels in the spectrogram - assert audio_shape == (1, 256) - - def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False) - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(test_mean_pixel_difference=False) - - -@slow -@require_paddle_gpu -class AudioLDMPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, dtype=paddle.float32, seed=0): - generator = paddle.Generator().manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16)) - latents = paddle.to_tensor(latents).cast(dtype=dtype) - inputs = { - "prompt": "A hammer hitting a wooden surface", - "latents": latents, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 2.5, - } - return inputs - - def test_audioldm(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm") - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs() - 
inputs["num_inference_steps"] = 25 - audio = audioldm_pipe(**inputs).audios[0] - - assert audio.ndim == 1 - assert len(audio) == 81920 - - audio_slice = audio[77230:77240] - expected_slice = np.array( - [-0.4884, -0.4607, 0.0023, 0.5007, 0.5896, 0.5151, 0.3813, -0.0208, -0.3687, -0.4315] - ) - max_diff = np.abs(expected_slice - audio_slice).max() - assert max_diff < 1e-2 - - def test_audioldm_lms(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm") - audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs() - audio = audioldm_pipe(**inputs).audios[0] - - assert audio.ndim == 1 - assert len(audio) == 81920 - - audio_slice = audio[27780:27790] - expected_slice = np.array([-0.2131, -0.0873, -0.0124, -0.0189, 0.0569, 0.1373, 0.1883, 0.2886, 0.3297, 0.2212]) - max_diff = np.abs(expected_slice - audio_slice).max() - assert max_diff < 3e-2 diff --git a/ppdiffusers/tests/pipelines/dance_diffusion/__init__.py b/ppdiffusers/tests/pipelines/dance_diffusion/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/dance_diffusion/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/ppdiffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py deleted file mode 100644 index 1d66ac53ab10..000000000000 --- a/ppdiffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import unittest - -import numpy as np -import paddle - -from ppdiffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel -from ppdiffusers.utils import slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - -from ..pipeline_params import ( - UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, - UNCONDITIONAL_AUDIO_GENERATION_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin - - -class DanceDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = DanceDiffusionPipeline - test_attention_slicing = False - test_cpu_offload = False - params = UNCONDITIONAL_AUDIO_GENERATION_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "callback", - "latents", - "callback_steps", - "output_type", - "num_images_per_prompt", - } - batch_params = UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet1DModel( - block_out_channels=(32, 32, 64), - extra_in_channels=16, - sample_size=512, - sample_rate=16000, - in_channels=2, - out_channels=2, - flip_sin_to_cos=True, - use_timestep_embedding=False, - time_embedding_type="fourier", - mid_block_type="UNetMidBlock1D", - down_block_types=("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"), - up_block_types=("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"), - ) - scheduler = IPNDMScheduler() - components = {"unet": unet, "scheduler": scheduler} - return components - - def get_dummy_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed) - - inputs = {"batch_size": 1, "generator": generator, "num_inference_steps": 4} - return inputs - - def test_dance_diffusion(self): - components = self.get_dummy_components() - pipe = DanceDiffusionPipeline(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - output = pipe(**inputs) - audio = output.audios - audio_slice = audio[0, -3:, -3:] - assert audio.shape == (1, 2, components["unet"].sample_size) - expected_slice = np.array([1.0, 1.0, 0.9972942, -0.4477799, -0.5952974, 1.0]) - assert np.abs(audio_slice.flatten() - expected_slice).max() < 0.01 - - -@slow -@require_paddle_gpu -class PipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_dance_diffusion(self): - pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k") - pipe.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096) - audio = output.audios - audio_slice = audio[0, -3:, -3:] - assert audio.shape == (1, 2, pipe.unet.sample_size) - expected_slice = np.array([-0.15758808, -0.15257765, -0.12701476, -0.26994032, -0.27616554, -0.24865153]) - assert np.abs(audio_slice.flatten() - expected_slice).max() < 0.01 - - def test_dance_diffusion_fp16(self): - pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", paddle_dtype=paddle.float16) - pipe.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096) - audio = output.audios - audio_slice = audio[0, -3:, -3:] - assert audio.shape == (1, 2, pipe.unet.sample_size) - # scheduler use fp32 - expected_slice = np.array([-0.15350387, -0.14624646, -0.12091318, -0.25969276, -0.26154587, -0.23359495]) - assert np.abs(audio_slice.flatten() - 
expected_slice).max() < 0.05 diff --git a/ppdiffusers/tests/pipelines/ddim/__init__.py b/ppdiffusers/tests/pipelines/ddim/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/ddim/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/ddim/test_ddim.py b/ppdiffusers/tests/pipelines/ddim/test_ddim.py deleted file mode 100644 index 78830aa73981..000000000000 --- a/ppdiffusers/tests/pipelines/ddim/test_ddim.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import paddle - -from ppdiffusers import DDIMPipeline, DDIMScheduler, UNet2DModel -from ppdiffusers.utils.testing_utils import require_paddle_gpu, slow - -from ..pipeline_params import ( - UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, - UNCONDITIONAL_IMAGE_GENERATION_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin - - -class DDIMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = DDIMPipeline - test_cpu_offload = False - params = UNCONDITIONAL_IMAGE_GENERATION_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "num_images_per_prompt", - "latents", - "callback", - "callback_steps", - } - batch_params = UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - scheduler = DDIMScheduler() - components = {"unet": unet, "scheduler": scheduler} - return components - - def get_dummy_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed) - - inputs = {"batch_size": 1, "generator": generator, "num_inference_steps": 2, "output_type": "numpy"} - return inputs - - def test_inference(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - self.assertEqual(image.shape, (1, 32, 32, 3)) - expected_slice = np.array([0.0, 0.00152004, 0.0, 0.0, 0.00860906, 0.00182715, 0.00189051, 1.0, 0.668702]) - max_diff = np.abs(image_slice.flatten() - expected_slice).max() - self.assertLessEqual(max_diff, 0.001) - - -@slow -@require_paddle_gpu -class DDIMPipelineIntegrationTests(unittest.TestCase): - def test_inference_cifar10(self): - model_id = "google/ddpm-cifar10-32" - unet = UNet2DModel.from_pretrained(model_id) - scheduler = DDIMScheduler() - ddim = DDIMPipeline(unet=unet, scheduler=scheduler) - ddim.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - image = ddim(generator=generator, eta=0.0, output_type="numpy").images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.2060, 0.2042, 0.2022, 0.2193, 0.2146, 0.2110, 0.2471, 0.2446, 0.2388]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_inference_ema_bedroom(self): - model_id = "google/ddpm-ema-bedroom-256" - unet = UNet2DModel.from_pretrained(model_id) - scheduler = DDIMScheduler.from_pretrained(model_id) - ddim = DDIMPipeline(unet=unet, scheduler=scheduler) - ddim.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - image = ddim(generator=generator, output_type="numpy").images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array( - [0.19830778, 0.18826014, 0.18584034, 0.1927332, 0.18754855, 0.17855307, 0.18288234, 0.16375086, 0.1497818] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/ddpm/__init__.py b/ppdiffusers/tests/pipelines/ddpm/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/ddpm/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# 
Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/ddpm/test_ddpm.py b/ppdiffusers/tests/pipelines/ddpm/test_ddpm.py deleted file mode 100644 index afb93b29df21..000000000000 --- a/ppdiffusers/tests/pipelines/ddpm/test_ddpm.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import paddle - -from ppdiffusers import DDPMPipeline, DDPMScheduler, UNet2DModel -from ppdiffusers.utils.testing_utils import require_paddle_gpu, slow - - -class DDPMPipelineFastTests(unittest.TestCase): - @property - def dummy_uncond_unet(self): - paddle.seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - def test_fast_inference(self): - unet = self.dummy_uncond_unet - scheduler = DDPMScheduler() - ddpm = DDPMPipeline(unet=unet, scheduler=scheduler) - ddpm.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images - generator = paddle.Generator().manual_seed(0) - image_from_tuple = ddpm(generator=generator, num_inference_steps=2, output_type="numpy", return_dict=False)[0] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [ - 0.0, - 0.0, - 0.0, - 0.0, - 0.007474243640899658, - 0.0, - 0.007990598678588867, - 0.9972629547119141, - 0.6665917634963989, - ] - ) - print(image_slice.flatten().tolist()) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 - - def test_inference_predict_sample(self): - unet = self.dummy_uncond_unet - scheduler = DDPMScheduler(prediction_type="sample") - ddpm = DDPMPipeline(unet=unet, scheduler=scheduler) - ddpm.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images - generator = 
paddle.Generator().manual_seed(0) - image_eps = ddpm(generator=generator, num_inference_steps=2, output_type="numpy")[0] - image_slice = image[0, -3:, -3:, -1] - image_eps_slice = image_eps[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - tolerance = 0.01 - assert np.abs(image_slice.flatten() - image_eps_slice.flatten()).max() < tolerance - - -@slow -@require_paddle_gpu -class DDPMPipelineIntegrationTests(unittest.TestCase): - def test_inference_cifar10(self): - model_id = "google/ddpm-cifar10-32" - unet = UNet2DModel.from_pretrained(model_id) - scheduler = DDPMScheduler.from_pretrained(model_id) - ddpm = DDPMPipeline(unet=unet, scheduler=scheduler) - ddpm.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - image = ddpm(generator=generator, output_type="numpy").images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4454, 0.2025, 0.0315, 0.3023, 0.2575, 0.1031, 0.0953, 0.1604, 0.2020]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/__init__.py b/ppdiffusers/tests/pipelines/deepfloyd_if/__init__.py deleted file mode 100644 index d2512121d603..000000000000 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/__init__.py +++ /dev/null @@ -1,281 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import tempfile - -import numpy as np -import paddle - -from paddlenlp.transformers import AutoTokenizer, T5EncoderModel -from ppdiffusers import DDPMScheduler, UNet2DConditionModel -from ppdiffusers.models.attention_processor import AttnAddedKVProcessor -from ppdiffusers.pipelines.deepfloyd_if import IFWatermarker - -from ..test_pipelines_common import to_np - -# WARN: the hf-internal-testing/tiny-random-t5 text encoder has some non-determinism in the `save_load` tests. 
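# Not part of the removed file: a minimal, illustrative sketch (helper name assumed) of the
# tolerance-based comparison used by the save/load tests in this mixin. Outputs produced
# before and after a save_pretrained / from_pretrained round trip are compared via their
# maximum absolute difference; where the tiny-random-t5 encoder introduces the
# non-determinism warned about above, the concrete tests either skip the float16 variant
# or loosen the bound (e.g. expected_max_diff=1e-1).
import numpy as np

def outputs_close(output, output_loaded, max_diff=1e-4):
    # Both arguments are array-like pipeline outputs generated from identical seeded inputs.
    return float(np.abs(np.asarray(output) - np.asarray(output_loaded)).max()) < max_diff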
- - -class IFPipelineTesterMixin: - def _get_dummy_components(self): - paddle.seed(0) - text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5") - text_encoder.eval() - - paddle.seed(0) - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5") - - paddle.seed(0) - unet = UNet2DConditionModel( - sample_size=32, - layers_per_block=1, - block_out_channels=[32, 64], - down_block_types=[ - "ResnetDownsampleBlock2D", - "SimpleCrossAttnDownBlock2D", - ], - mid_block_type="UNetMidBlock2DSimpleCrossAttn", - up_block_types=["SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"], - in_channels=3, - out_channels=6, - cross_attention_dim=32, - encoder_hid_dim=32, - attention_head_dim=8, - addition_embed_type="text", - addition_embed_type_num_heads=2, - cross_attention_norm="group_norm", - resnet_time_scale_shift="scale_shift", - act_fn="gelu", - ) - unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests - - paddle.seed(0) - scheduler = DDPMScheduler( - num_train_timesteps=1000, - beta_schedule="squaredcos_cap_v2", - beta_start=0.0001, - beta_end=0.02, - thresholding=True, - dynamic_thresholding_ratio=0.95, - sample_max_value=1.0, - prediction_type="epsilon", - variance_type="learned_range", - ) - - paddle.seed(0) - watermarker = IFWatermarker() - - return { - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "unet": unet, - "scheduler": scheduler, - "watermarker": watermarker, - "safety_checker": None, - "feature_extractor": None, - } - - def _get_superresolution_dummy_components(self): - paddle.seed(0) - text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5") - text_encoder.eval() - - paddle.seed(0) - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5") - - paddle.seed(0) - unet = UNet2DConditionModel( - sample_size=32, - layers_per_block=[1, 2], - block_out_channels=[32, 64], - down_block_types=[ - "ResnetDownsampleBlock2D", - "SimpleCrossAttnDownBlock2D", - ], - mid_block_type="UNetMidBlock2DSimpleCrossAttn", - up_block_types=["SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"], - in_channels=6, - out_channels=6, - cross_attention_dim=32, - encoder_hid_dim=32, - attention_head_dim=8, - addition_embed_type="text", - addition_embed_type_num_heads=2, - cross_attention_norm="group_norm", - resnet_time_scale_shift="scale_shift", - act_fn="gelu", - class_embed_type="timestep", - mid_block_scale_factor=1.414, - time_embedding_act_fn="gelu", - time_embedding_dim=32, - ) - unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests - - paddle.seed(0) - scheduler = DDPMScheduler( - num_train_timesteps=1000, - beta_schedule="squaredcos_cap_v2", - beta_start=0.0001, - beta_end=0.02, - thresholding=True, - dynamic_thresholding_ratio=0.95, - sample_max_value=1.0, - prediction_type="epsilon", - variance_type="learned_range", - ) - - paddle.seed(0) - image_noising_scheduler = DDPMScheduler( - num_train_timesteps=1000, - beta_schedule="squaredcos_cap_v2", - beta_start=0.0001, - beta_end=0.02, - ) - - paddle.seed(0) - watermarker = IFWatermarker() - - return { - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "unet": unet, - "scheduler": scheduler, - "image_noising_scheduler": image_noising_scheduler, - "watermarker": watermarker, - "safety_checker": None, - "feature_extractor": None, - } - - # this test is modified from the base class because if pipelines set the text encoder - # as optional with the intention that the user is allowed to encode the prompt 
once - # and then pass the embeddings directly to the pipeline. The base class test uses - # the unmodified arguments from `self.get_dummy_inputs` which will pass the unencoded - # prompt to the pipeline when the text encoder is set to None, throwing an error. - # So we make the test reflect the intended usage of setting the text encoder to None. - def _test_save_load_optional_components(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - - prompt = inputs["prompt"] - generator = inputs["generator"] - num_inference_steps = inputs["num_inference_steps"] - output_type = inputs["output_type"] - - if "image" in inputs: - image = inputs["image"] - else: - image = None - - if "mask_image" in inputs: - mask_image = inputs["mask_image"] - else: - mask_image = None - - if "original_image" in inputs: - original_image = inputs["original_image"] - else: - original_image = None - - prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(prompt) - - # inputs with prompt converted to embeddings - inputs = { - "prompt_embeds": prompt_embeds, - "negative_prompt_embeds": negative_prompt_embeds, - "generator": generator, - "num_inference_steps": num_inference_steps, - "output_type": output_type, - } - - if image is not None: - inputs["image"] = image - - if mask_image is not None: - inputs["mask_image"] = mask_image - - if original_image is not None: - inputs["original_image"] = original_image - - # set all optional components to None - for optional_component in pipe._optional_components: - setattr(pipe, optional_component, None) - - output = pipe(**inputs)[0] - - with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir) - pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) - pipe_loaded.set_progress_bar_config(disable=None) - - pipe_loaded.unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests - - for optional_component in pipe._optional_components: - self.assertTrue( - getattr(pipe_loaded, optional_component) is None, - f"`{optional_component}` did not stay set to None after loading.", - ) - - inputs = self.get_dummy_inputs() - - generator = inputs["generator"] - num_inference_steps = inputs["num_inference_steps"] - output_type = inputs["output_type"] - - # inputs with prompt converted to embeddings - inputs = { - "prompt_embeds": prompt_embeds, - "negative_prompt_embeds": negative_prompt_embeds, - "generator": generator, - "num_inference_steps": num_inference_steps, - "output_type": output_type, - } - - if image is not None: - inputs["image"] = image - - if mask_image is not None: - inputs["mask_image"] = mask_image - - if original_image is not None: - inputs["original_image"] = original_image - - output_loaded = pipe_loaded(**inputs)[0] - - max_diff = np.abs(to_np(output) - to_np(output_loaded)).max() - self.assertLess(max_diff, 1e-4) - - # Modified from `PipelineTesterMixin` to set the attn processor as it's not serialized. - # This should be handled in the base test and then this method can be removed. 
- def _test_save_load_local(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - output = pipe(**inputs)[0] - with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir) - pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) - pipe_loaded.set_progress_bar_config(disable=None) - - pipe_loaded.unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests - - inputs = self.get_dummy_inputs() - output_loaded = pipe_loaded(**inputs)[0] - - max_diff = np.abs(to_np(output) - to_np(output_loaded)).max() - self.assertLess(max_diff, 1e-4) diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if.py deleted file mode 100644 index 4b1a52e2b950..000000000000 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if.py +++ /dev/null @@ -1,322 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import paddle - -from ppdiffusers import ( - IFImg2ImgPipeline, - IFImg2ImgSuperResolutionPipeline, - IFInpaintingPipeline, - IFInpaintingSuperResolutionPipeline, - IFPipeline, - IFSuperResolutionPipeline, -) -from ppdiffusers.models.attention_processor import AttnAddedKVProcessor -from ppdiffusers.utils.testing_utils import ( - floats_tensor, - load_numpy, - require_paddle_gpu, - slow, -) - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference -from . 
import IFPipelineTesterMixin - - -class IFPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): - pipeline_class = IFPipeline - params = TEXT_TO_IMAGE_PARAMS - {"width", "height", "latents"} - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} - - def get_dummy_components(self): - return self._get_dummy_components() - - def get_dummy_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "output_type": "numpy", - } - - return inputs - - def test_save_load_optional_components(self): - self._test_save_load_optional_components() - - def test_save_load_float16(self): - # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder - pass - - def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass(expected_max_diff=1e-2) - - def test_save_load_local(self): - self._test_save_load_local() - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical( - expected_max_diff=1e-2, - ) - - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) - - -@slow -@require_paddle_gpu -class IFPipelineSlowTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_all(self): - # if - - pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16) - - pipe_2 = IFSuperResolutionPipeline.from_pretrained( - "DeepFloyd/IF-II-L-v1.0", variant="fp16", paddle_dtype=paddle.float16, text_encoder=None, tokenizer=None - ) - - # pre compute text embeddings and remove T5 to save memory - - pipe_1.text_encoder - - prompt_embeds, negative_prompt_embeds = pipe_1.encode_prompt("anime turtle") - - del pipe_1.tokenizer - del pipe_1.text_encoder - gc.collect() - - pipe_1.tokenizer = None - pipe_1.text_encoder = None - - pipe_1.enable_model_cpu_offload() - pipe_2.enable_model_cpu_offload() - - pipe_1.unet.set_attn_processor(AttnAddedKVProcessor()) - pipe_2.unet.set_attn_processor(AttnAddedKVProcessor()) - - self._test_if(pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds) - - pipe_1.remove_all_hooks() - pipe_2.remove_all_hooks() - - # img2img - - pipe_1 = IFImg2ImgPipeline(**pipe_1.components) - pipe_2 = IFImg2ImgSuperResolutionPipeline(**pipe_2.components) - - pipe_1.enable_model_cpu_offload() - pipe_2.enable_model_cpu_offload() - - pipe_1.unet.set_attn_processor(AttnAddedKVProcessor()) - pipe_2.unet.set_attn_processor(AttnAddedKVProcessor()) - - self._test_if_img2img(pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds) - - pipe_1.remove_all_hooks() - pipe_2.remove_all_hooks() - - # inpainting - - pipe_1 = IFInpaintingPipeline(**pipe_1.components) - pipe_2 = IFInpaintingSuperResolutionPipeline(**pipe_2.components) - - pipe_1.enable_model_cpu_offload() - pipe_2.enable_model_cpu_offload() - - pipe_1.unet.set_attn_processor(AttnAddedKVProcessor()) - pipe_2.unet.set_attn_processor(AttnAddedKVProcessor()) - - self._test_if_inpainting(pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds) - - def _test_if(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): - # pipeline 1 - - generator = paddle.Generator().manual_seed(0) - output = pipe_1( - 
prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - num_inference_steps=2, - generator=generator, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (64, 64, 3) - - mem_bytes = paddle.cuda.max_memory_allocated() - assert mem_bytes < 13 * 10**9 - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if.npy" - ) - assert_mean_pixel_difference(image, expected_image) - - # pipeline 2 - - generator = paddle.Generator().manual_seed(0) - - image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)) - - output = pipe_2( - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - image=image, - generator=generator, - num_inference_steps=2, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (256, 256, 3) - - mem_bytes = paddle.cuda.max_memory_allocated() - assert mem_bytes < 4 * 10**9 - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_superresolution_stage_II.npy" - ) - assert_mean_pixel_difference(image, expected_image) - - def _test_if_img2img(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): - # pipeline 1 - - image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)) - - generator = paddle.Generator().manual_seed(0) - - output = pipe_1( - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - image=image, - num_inference_steps=2, - generator=generator, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (64, 64, 3) - - mem_bytes = paddle.cuda.max_memory_allocated() - assert mem_bytes < 10 * 10**9 - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_img2img.npy" - ) - assert_mean_pixel_difference(image, expected_image) - - # pipeline 2 - - generator = paddle.Generator().manual_seed(0) - - original_image = floats_tensor((1, 3, 256, 256), rng=random.Random(0)) - image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)) - - output = pipe_2( - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - image=image, - original_image=original_image, - generator=generator, - num_inference_steps=2, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (256, 256, 3) - - mem_bytes = paddle.cuda.max_memory_allocated() - assert mem_bytes < 4 * 10**9 - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_img2img_superresolution_stage_II.npy" - ) - assert_mean_pixel_difference(image, expected_image) - - def _test_if_inpainting(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): - # pipeline 1 - - image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)) - mask_image = floats_tensor((1, 3, 64, 64), rng=random.Random(1)) - - generator = paddle.Generator().manual_seed(0) - output = pipe_1( - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - image=image, - mask_image=mask_image, - num_inference_steps=2, - generator=generator, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (64, 64, 3) - - mem_bytes = paddle.cuda.max_memory_allocated() - assert mem_bytes < 10 * 10**9 - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_inpainting.npy" - ) - 
assert_mean_pixel_difference(image, expected_image) - - # pipeline 2 - - generator = paddle.Generator().manual_seed(0) - - image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)) - original_image = floats_tensor((1, 3, 256, 256), rng=random.Random(0)) - mask_image = floats_tensor((1, 3, 256, 256), rng=random.Random(1)) - - output = pipe_2( - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - image=image, - mask_image=mask_image, - original_image=original_image, - generator=generator, - num_inference_steps=2, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (256, 256, 3) - - mem_bytes = paddle.device.cuda.max_memory_allocated() - assert mem_bytes < 4 * 10**9 - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_inpainting_superresolution_stage_II.npy" - ) - assert_mean_pixel_difference(image, expected_image) diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img.py deleted file mode 100644 index bab44fc4a5cb..000000000000 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random -import unittest - -import paddle - -from ppdiffusers import IFImg2ImgPipeline -from ppdiffusers.utils import floats_tensor - -from ..pipeline_params import ( - TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin -from . 
import IFPipelineTesterMixin - - -class IFImg2ImgPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): - pipeline_class = IFImg2ImgPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} - - def get_dummy_components(self): - return self._get_dummy_components() - - def get_dummy_inputs(self, seed=0): - - generator = paddle.Generator().manual_seed(seed) - - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": image, - "generator": generator, - "num_inference_steps": 2, - "output_type": "numpy", - } - - return inputs - - def test_save_load_optional_components(self): - self._test_save_load_optional_components() - - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) - - def test_save_load_float16(self): - # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder - super().test_save_load_float16(expected_max_diff=1e-1) - - def test_float16_inference(self): - super().test_float16_inference(expected_max_diff=1e-1) - - def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass(expected_max_diff=1e-2) - - def test_save_load_local(self): - self._test_save_load_local() - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical( - expected_max_diff=1e-2, - ) diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py deleted file mode 100644 index 0d977c5d6f2e..000000000000 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random -import unittest - -import paddle - -from ppdiffusers import IFImg2ImgSuperResolutionPipeline -from ppdiffusers.utils import floats_tensor - -from ..pipeline_params import ( - TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin -from . 
import IFPipelineTesterMixin - - -class IFImg2ImgSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): - pipeline_class = IFImg2ImgSuperResolutionPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"original_image"}) - required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} - - def get_dummy_components(self): - return self._get_superresolution_dummy_components() - - def get_dummy_inputs(self, seed=0): - - generator = paddle.Generator().manual_seed(seed) - - original_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - image = floats_tensor((1, 3, 16, 16), rng=random.Random(seed)) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": image, - "original_image": original_image, - "generator": generator, - "num_inference_steps": 2, - "output_type": "numpy", - } - - return inputs - - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) - - def test_save_load_optional_components(self): - self._test_save_load_optional_components() - - def test_save_load_float16(self): - # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder - super().test_save_load_float16(expected_max_diff=1e-1) - - def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass(expected_max_diff=1e-2) - - def test_save_load_local(self): - self._test_save_load_local() - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical( - expected_max_diff=1e-2, - ) diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting.py deleted file mode 100644 index e46b7c5ebea6..000000000000 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random -import unittest - -import paddle - -from ppdiffusers import IFInpaintingPipeline -from ppdiffusers.utils import floats_tensor - -from ..pipeline_params import ( - TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin -from . 
import IFPipelineTesterMixin - - -class IFInpaintingPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): - pipeline_class = IFInpaintingPipeline - params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"width", "height"} - batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} - - def get_dummy_components(self): - return self._get_dummy_components() - - def get_dummy_inputs(self, seed=0): - - generator = paddle.Generator().manual_seed(seed) - - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - mask_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 2, - "output_type": "numpy", - } - - return inputs - - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) - - def test_save_load_optional_components(self): - self._test_save_load_optional_components() - - def test_save_load_float16(self): - # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder - super().test_save_load_float16(expected_max_diff=1e-1) - - def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass(expected_max_diff=1e-2) - - def test_save_load_local(self): - self._test_save_load_local() - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical( - expected_max_diff=1e-2, - ) diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py deleted file mode 100644 index d50852284146..000000000000 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random -import unittest - -import paddle - -from ppdiffusers import IFInpaintingSuperResolutionPipeline -from ppdiffusers.utils import floats_tensor - -from ..pipeline_params import ( - TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin -from . 
import IFPipelineTesterMixin - - -class IFInpaintingSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): - pipeline_class = IFInpaintingSuperResolutionPipeline - params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"width", "height"} - batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS.union({"original_image"}) - required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} - - def get_dummy_components(self): - return self._get_superresolution_dummy_components() - - def get_dummy_inputs(self, seed=0): - - generator = paddle.Generator().manual_seed(seed) - - image = floats_tensor((1, 3, 16, 16), rng=random.Random(seed)) - original_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - mask_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": image, - "original_image": original_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 2, - "output_type": "numpy", - } - - return inputs - - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) - - def test_save_load_optional_components(self): - self._test_save_load_optional_components() - - def test_save_load_float16(self): - # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder - super().test_save_load_float16(expected_max_diff=1e-1) - - def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass(expected_max_diff=1e-2) - - def test_save_load_local(self): - self._test_save_load_local() - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical( - expected_max_diff=1e-2, - ) diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_superresolution.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_superresolution.py deleted file mode 100644 index 79a7319b8075..000000000000 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_superresolution.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random -import unittest - -import paddle - -from ppdiffusers import IFSuperResolutionPipeline -from ppdiffusers.utils import floats_tensor - -from ..pipeline_params import ( - TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin -from . 
import IFPipelineTesterMixin - - -class IFSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): - pipeline_class = IFSuperResolutionPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} - - def get_dummy_components(self): - return self._get_superresolution_dummy_components() - - def get_dummy_inputs(self, seed=0): - - generator = paddle.Generator().manual_seed(seed) - - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": image, - "generator": generator, - "num_inference_steps": 2, - "output_type": "numpy", - } - - return inputs - - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) - - def test_save_load_optional_components(self): - self._test_save_load_optional_components() - - def test_save_load_float16(self): - # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder - super().test_save_load_float16(expected_max_diff=1e-1) - - def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass(expected_max_diff=1e-2) - - def test_save_load_local(self): - self._test_save_load_local() - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical( - expected_max_diff=1e-2, - ) diff --git a/ppdiffusers/tests/pipelines/dit/__init__.py b/ppdiffusers/tests/pipelines/dit/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/dit/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/dit/test_dit.py b/ppdiffusers/tests/pipelines/dit/test_dit.py deleted file mode 100644 index 98e77e486948..000000000000 --- a/ppdiffusers/tests/pipelines/dit/test_dit.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import unittest - -import numpy as np -import paddle - -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - DiTPipeline, - DPMSolverMultistepScheduler, - Transformer2DModel, -) -from ppdiffusers.utils import slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - -from ..pipeline_params import ( - CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS, - CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin - - -class DiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = DiTPipeline - test_cpu_offload = False - params = CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents", - "num_images_per_prompt", - "callback", - "callback_steps", - } - batch_params = CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS - - def get_dummy_components(self): - paddle.seed(0) - transformer = Transformer2DModel( - sample_size=16, - num_layers=2, - patch_size=4, - attention_head_dim=8, - num_attention_heads=2, - in_channels=4, - out_channels=8, - attention_bias=True, - activation_fn="gelu-approximate", - num_embeds_ada_norm=1000, - norm_type="ada_norm_zero", - norm_elementwise_affine=False, - ) - vae = AutoencoderKL() - scheduler = DDIMScheduler() - components = {"transformer": transformer.eval(), "vae": vae.eval(), "scheduler": scheduler} - return components - - def get_dummy_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed) - - inputs = {"class_labels": [1], "generator": generator, "num_inference_steps": 2, "output_type": "numpy"} - return inputs - - def test_inference(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - self.assertEqual(image.shape, (1, 16, 16, 3)) - print(image_slice.flatten()) - expected_slice = np.array([0.28088313, 0.0, 0.8108508, 1.0, 1.0, 0.47994, 0.9075564, 0.0, 0.14398015]) - max_diff = np.abs(image_slice.flatten() - expected_slice).max() - self.assertLessEqual(max_diff, 0.001) - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(relax_max_difference=True, expected_max_diff=1e-3) - - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) - - -@require_paddle_gpu -@slow -class DiTPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_dit_256(self): - generator = paddle.Generator().manual_seed(0) - pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256") - pipe.to("gpu") - - words = ["vase", "umbrella", "white shark", "white wolf"] - ids = pipe.get_label_ids(words) - images = pipe(ids, generator=generator, num_inference_steps=40, output_type="np").images - expected_slices = np.array( - [ - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0016301274299621582, 0.0, 0.0, 0.0, 0.0], - [ - 0.434637188911438, - 0.4323567748069763, - 0.4406988322734833, - 0.442973256111145, - 0.4462621212005615, - 0.45129328966140747, - 0.41893237829208374, - 0.42390328645706177, - 0.3906112015247345, - ], - [ - 0.9986965656280518, - 0.9948190450668335, - 0.9841029644012451, - 0.9911775588989258, - 0.9871039390563965, - 0.9874314069747925, - 0.9822297096252441, - 
0.9997426271438599, - 1.0, - ], - ] - ) - - for word, image, expected_slice in zip(words, images, expected_slices): - # expected_image = load_numpy( - # f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/{word}.npy" - # ) - assert image.shape == (256, 256, 3) - image_slice = image[-3:, -3:, -1] - assert np.abs((image_slice.flatten() - expected_slice).max()) < 0.001 - - def test_dit_512_fp16(self): - pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-512", paddle_dtype=paddle.float16) - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - pipe.to("gpu") - - words = ["vase", "umbrella"] - ids = pipe.get_label_ids(words) - generator = paddle.Generator().manual_seed(0) - images = pipe(ids, generator=generator, num_inference_steps=25, output_type="np").images - - expected_slices = np.array( - [ - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.994140625], - [ - 0.0, - 0.0, - 0.01708984375, - 0.024658203125, - 0.0830078125, - 0.134521484375, - 0.175537109375, - 0.33740234375, - 0.207763671875, - ], - ] - ) - - for word, image, expected_slice in zip(words, images, expected_slices): - # expected_image = load_numpy( - # f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/{word}_fp16.npy" - # ) - assert image.shape == (512, 512, 3) - image_slice = image[-3:, -3:, -1] - # TODO make this pass, maybe caused by DPMSolverMultistepScheduler - assert np.abs((image_slice.flatten() - expected_slice).max()) < 0.75 diff --git a/ppdiffusers/tests/pipelines/karras_ve/__init__.py b/ppdiffusers/tests/pipelines/karras_ve/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/karras_ve/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/karras_ve/test_karras_ve.py b/ppdiffusers/tests/pipelines/karras_ve/test_karras_ve.py deleted file mode 100644 index e7c902783732..000000000000 --- a/ppdiffusers/tests/pipelines/karras_ve/test_karras_ve.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import paddle - -from ppdiffusers import KarrasVePipeline, KarrasVeScheduler, UNet2DModel -from ppdiffusers.utils.testing_utils import require_paddle, slow - - -class KarrasVePipelineFastTests(unittest.TestCase): - @property - def dummy_uncond_unet(self): - paddle.seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - def test_inference(self): - unet = self.dummy_uncond_unet - scheduler = KarrasVeScheduler() - pipe = KarrasVePipeline(unet=unet, scheduler=scheduler) - pipe.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - image = pipe(num_inference_steps=2, generator=generator, output_type="numpy").images - generator = paddle.Generator().manual_seed(0) - image_from_tuple = pipe(num_inference_steps=2, generator=generator, output_type="numpy", return_dict=False)[0] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 - - -@slow -@require_paddle -class KarrasVePipelineIntegrationTests(unittest.TestCase): - def test_inference(self): - model_id = "google/ncsnpp-celebahq-256" - model = UNet2DModel.from_pretrained(model_id) - scheduler = KarrasVeScheduler() - pipe = KarrasVePipeline(unet=model, scheduler=scheduler) - pipe.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - image = pipe(num_inference_steps=20, generator=generator, output_type="numpy").images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array( - [0.7528239, 0.7529462, 0.76014197, 0.75482357, 0.75692874, 0.7577723, 0.760527, 0.758951, 0.7599246] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/latent_diffusion/__init__.py b/ppdiffusers/tests/pipelines/latent_diffusion/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/latent_diffusion/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py deleted file mode 100644 index 9f1340825c0d..000000000000 --- a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py +++ /dev/null @@ -1,192 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
-# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import paddle - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - LDMTextToImagePipeline, - UNet2DConditionModel, -) -from ppdiffusers.utils.testing_utils import ( - load_numpy, - nightly, - require_paddle_gpu, - slow, -) - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - - -class LDMTextToImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = LDMTextToImagePipeline - params = TEXT_TO_IMAGE_PARAMS - { - "negative_prompt", - "negative_prompt_embeds", - "cross_attention_kwargs", - "prompt_embeds", - } - required_optional_params = PipelineTesterMixin.required_optional_params - { - "num_images_per_prompt", - "callback", - "callback_steps", - } - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - test_cpu_offload = False - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - paddle.seed(0) - vae = AutoencoderKL( - block_out_channels=(32, 64), - in_channels=3, - out_channels=3, - down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"), - up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"), - latent_channels=4, - ) - paddle.seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - components = {"unet": unet, "scheduler": scheduler, "vqvae": vae, "bert": text_encoder, "tokenizer": tokenizer} - return components - - def get_dummy_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_inference_text2img(self): - components = self.get_dummy_components() - pipe = LDMTextToImagePipeline(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - assert image.shape == (1, 64, 64, 3) - image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array( - [0.28524342, 0.23806289, 0.38151595, 0.21939021, 
0.26112252, 0.5172909, 0.25647423, 0.25049314, 0.47979864] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - - -@slow -@require_paddle_gpu -class LDMTextToImagePipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, dtype="float32", seed=0): - generator = paddle.Generator().manual_seed(seed=seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 32, 32)) - latents = paddle.to_tensor(latents).cast(dtype) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "latents": latents, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_ldm_default_ddim(self): - pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256") - pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.51825, 0.5285, 0.52543, 0.54258, 0.52304, 0.52569, 0.54363, 0.55276, 0.56878]) - max_diff = np.abs(expected_slice - image_slice).max() - assert max_diff < 0.02 - - -@nightly -@require_paddle_gpu -class LDMTextToImagePipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, dtype="float32", seed=0): - generator = paddle.Generator().manual_seed(seed=seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 32, 32)) - latents = paddle.to_tensor(latents).cast(dtype) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "latents": latents, - "generator": generator, - "num_inference_steps": 50, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_ldm_default_ddim(self): - pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256") - pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = pipe(**inputs).images[0] - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/ldm_text2img/ldm_large_256_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.05 diff --git a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py deleted file mode 100644 index 0637c57cd878..000000000000 --- a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import random -import unittest - -import numpy as np -import paddle - -from ppdiffusers import DDIMScheduler, LDMSuperResolutionPipeline, UNet2DModel, VQModel -from ppdiffusers.utils import PIL_INTERPOLATION, floats_tensor, load_image, slow -from ppdiffusers.utils.testing_utils import require_paddle - - -class LDMSuperResolutionPipelineFastTests(unittest.TestCase): - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = 32, 32 - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) - return image - - @property - def dummy_uncond_unet(self): - paddle.seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=6, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - @property - def dummy_vq_model(self): - paddle.seed(0) - model = VQModel( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, - ) - return model - - def test_inference_superresolution(self): - unet = self.dummy_uncond_unet - scheduler = DDIMScheduler() - vqvae = self.dummy_vq_model - ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler) - ldm.set_progress_bar_config(disable=None) - init_image = self.dummy_image - generator = paddle.Generator().manual_seed(0) - image = ldm(image=init_image, generator=generator, num_inference_steps=2, output_type="numpy").images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.12982202, 0.8338444, 0.46506804, 0.5459576, 0.6662215, 0.38444045, 0.72195464, 0.5719301, 0.36579454] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 - - def test_inference_superresolution_fp16(self): - unet = self.dummy_uncond_unet - scheduler = DDIMScheduler() - vqvae = self.dummy_vq_model - unet = unet.to(dtype=paddle.float16) - vqvae = vqvae.to(dtype=paddle.float16) - ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler) - ldm.set_progress_bar_config(disable=None) - init_image = self.dummy_image - image = ldm(init_image, num_inference_steps=2, output_type="numpy").images - assert image.shape == (1, 64, 64, 3) - - -@slow -@require_paddle -class LDMSuperResolutionPipelineIntegrationTests(unittest.TestCase): - def test_inference_superresolution(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/vq_diffusion/teddy_bear_pool.png" - ) - init_image = init_image.resize((64, 64), resample=PIL_INTERPOLATION["lanczos"]) - ldm = LDMSuperResolutionPipeline.from_pretrained("duongna/ldm-super-resolution") - ldm.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - image = ldm(image=init_image, generator=generator, num_inference_steps=20, output_type="numpy").images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.7644, 0.7679, 0.7642, 0.7633, 0.7666, 0.756, 0.7425, 0.7257, 0.6907]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 diff --git a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py deleted file mode 100644 index 4d9f0320f8bd..000000000000 --- 
a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import paddle - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel -from ppdiffusers import DDIMScheduler, LDMPipeline, UNet2DModel, VQModel -from ppdiffusers.utils.testing_utils import require_paddle, slow - - -class LDMPipelineFastTests(unittest.TestCase): - @property - def dummy_uncond_unet(self): - paddle.seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - @property - def dummy_vq_model(self): - paddle.seed(0) - model = VQModel( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, - ) - return model - - @property - def dummy_text_encoder(self): - paddle.seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config).eval() - - def test_inference_uncond(self): - unet = self.dummy_uncond_unet - scheduler = DDIMScheduler() - vae = self.dummy_vq_model - ldm = LDMPipeline(unet=unet, vqvae=vae, scheduler=scheduler) - ldm.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - image = ldm(generator=generator, num_inference_steps=2, output_type="numpy").images - generator = paddle.Generator().manual_seed(0) - image_from_tuple = ldm(generator=generator, num_inference_steps=2, output_type="numpy", return_dict=False)[0] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.827049, 1.0, 0.6244688, 0.7729403, 1.0, 0.73071766, 0.6108738, 0.9107263, 0.7249622] - ) - tolerance = 0.01 - assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < tolerance - - -@slow -@require_paddle -class LDMPipelineIntegrationTests(unittest.TestCase): - def test_inference_uncond(self): - ldm = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256") - ldm.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - image = ldm(generator=generator, num_inference_steps=5, output_type="numpy").images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array( - [0.59802866, 0.61698544, 0.62753576, 0.6128236, 0.60961217, 0.617262, 
0.6060791, 0.60261935, 0.6129079] - ) - tolerance = 0.01 - assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance diff --git a/ppdiffusers/tests/pipelines/paint_by_example/__init__.py b/ppdiffusers/tests/pipelines/paint_by_example/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/paint_by_example/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/paint_by_example/test_paint_by_example.py b/ppdiffusers/tests/pipelines/paint_by_example/test_paint_by_example.py deleted file mode 100644 index 9878cf668754..000000000000 --- a/ppdiffusers/tests/pipelines/paint_by_example/test_paint_by_example.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import random -import unittest - -import numpy as np -import paddle -from PIL import Image - -from paddlenlp.transformers import CLIPImageProcessor, CLIPVisionConfig -from ppdiffusers import ( - AutoencoderKL, - PaintByExamplePipeline, - PNDMScheduler, - UNet2DConditionModel, -) -from ppdiffusers.pipelines.paint_by_example import PaintByExampleImageEncoder -from ppdiffusers.utils import floats_tensor, load_image, slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - -from ..pipeline_params import ( - IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, - IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin - - -class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = PaintByExamplePipeline - params = IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS - batch_params = IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=9, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - paddle.seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - paddle.seed(0) - config = CLIPVisionConfig( - hidden_size=32, - projection_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - image_size=32, - patch_size=4, - ) - image_encoder = PaintByExampleImageEncoder(config, proj_size=32) - feature_extractor = CLIPImageProcessor(crop_size=32, size=32) - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "image_encoder": image_encoder, - "safety_checker": None, - "feature_extractor": feature_extractor, - } - return components - - def convert_to_pd(self, image): - image = np.array(image.convert("RGB")) - image = image[None].transpose(0, 3, 1, 2) - image = paddle.to_tensor(data=image).cast("float32") / 127.5 - 1.0 - return image - - # TODO check this - def test_save_load_float16(self): - pass - - def get_dummy_inputs(self, seed=0): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - image = image.cpu().transpose(perm=[0, 2, 3, 1])[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64)) - example_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32)) - generator = paddle.Generator().manual_seed(seed) - - inputs = { - "example_image": example_image, - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_paint_by_example_inpaint(self): - components = self.get_dummy_components() - pipe = PaintByExamplePipeline(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - output = pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.82595694, 0.51862055, 0.5474039, 0.2411496, 0.20220888, 0.3430622, 0.3558151, 0.06606945, 0.4550809] - ) - assert 
np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_paint_by_example_image_tensor(self): - inputs = self.get_dummy_inputs() - inputs.pop("mask_image") - image = self.convert_to_pd(inputs.pop("image")) - mask_image = image.clip(min=0, max=1) / 2 - pipe = PaintByExamplePipeline(**self.get_dummy_components()) - pipe.set_progress_bar_config(disable=None) - output = pipe(image=image, mask_image=mask_image[:, 0], **inputs) - out_1 = output.images - image = image.cpu().transpose(perm=[0, 2, 3, 1])[0] - mask_image = mask_image.cpu().transpose(perm=[0, 2, 3, 1])[0] - image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(mask_image)).convert("RGB") - output = pipe(**self.get_dummy_inputs()) - out_2 = output.images - assert out_1.shape == (1, 64, 64, 3) - assert np.abs(out_1.flatten() - out_2.flatten()).max() < 0.05 - - -@slow -@require_paddle_gpu -class PaintByExamplePipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_paint_by_example(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/paint_by_example/dog_in_bucket.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/paint_by_example/mask.png" - ) - example_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/paint_by_example/panda.jpg" - ) - pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example") - pipe.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(seed=321) - output = pipe( - image=init_image, - mask_image=mask_image, - example_image=example_image, - generator=generator, - guidance_scale=5.0, - num_inference_steps=50, - output_type="np", - ) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.4834, 0.4811, 0.4874, 0.5122, 0.5081, 0.5144, 0.5291, 0.529, 0.5374]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.02 diff --git a/ppdiffusers/tests/pipelines/pipeline_params.py b/ppdiffusers/tests/pipelines/pipeline_params.py deleted file mode 100644 index f045127664b3..000000000000 --- a/ppdiffusers/tests/pipelines/pipeline_params.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# These are canonical sets of parameters for different types of pipelines. -# They are set on subclasses of `PipelineTesterMixin` as `params` and -# `batch_params`. -# -# If a pipeline's set of arguments has minor changes from one of the common sets -# of arguments, do not make modifications to the existing common sets of arguments. -# I.e. 
a text to image pipeline with non-configurable height and width arguments -# should set its attribute as `params = TEXT_TO_IMAGE_PARAMS - {'height', 'width'}`. - -TEXT_TO_IMAGE_PARAMS = frozenset( - [ - "prompt", - "height", - "width", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", - "cross_attention_kwargs", - ] -) - -TEXT_TO_IMAGE_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"]) - -IMAGE_VARIATION_PARAMS = frozenset( - [ - "image", - "height", - "width", - "guidance_scale", - ] -) - -IMAGE_VARIATION_BATCH_PARAMS = frozenset(["image"]) - -TEXT_GUIDED_IMAGE_VARIATION_PARAMS = frozenset( - [ - "prompt", - "image", - "height", - "width", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", - ] -) - -TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS = frozenset(["prompt", "image", "negative_prompt"]) - -TEXT_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset( - [ - # Text guided image variation with an image mask - "prompt", - "image", - "mask_image", - "height", - "width", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", - ] -) - -TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["prompt", "image", "mask_image", "negative_prompt"]) - -IMAGE_INPAINTING_PARAMS = frozenset( - [ - # image variation with an image mask - "image", - "mask_image", - "height", - "width", - "guidance_scale", - ] -) - -IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["image", "mask_image"]) - -IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset( - [ - "example_image", - "image", - "mask_image", - "height", - "width", - "guidance_scale", - ] -) - -IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["example_image", "image", "mask_image"]) - -CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS = frozenset(["class_labels"]) - -CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS = frozenset(["class_labels"]) - -UNCONDITIONAL_IMAGE_GENERATION_PARAMS = frozenset(["batch_size"]) - -UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS = frozenset([]) - -UNCONDITIONAL_AUDIO_GENERATION_PARAMS = frozenset(["batch_size"]) - -UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS = frozenset([]) - - -TEXT_TO_AUDIO_PARAMS = frozenset( - [ - "prompt", - "audio_length_in_s", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", - "cross_attention_kwargs", - ] -) - -TEXT_TO_AUDIO_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"]) -TOKENS_TO_AUDIO_GENERATION_PARAMS = frozenset(["input_tokens"]) - -TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS = frozenset(["input_tokens"]) diff --git a/ppdiffusers/tests/pipelines/pndm/__init__.py b/ppdiffusers/tests/pipelines/pndm/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/pndm/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/ppdiffusers/tests/pipelines/pndm/test_pndm.py b/ppdiffusers/tests/pipelines/pndm/test_pndm.py deleted file mode 100644 index 7f10e188fcf8..000000000000 --- a/ppdiffusers/tests/pipelines/pndm/test_pndm.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import paddle - -from ppdiffusers import PNDMPipeline, PNDMScheduler, UNet2DModel -from ppdiffusers.utils.testing_utils import require_paddle, slow - - -class PNDMPipelineFastTests(unittest.TestCase): - @property - def dummy_uncond_unet(self): - paddle.seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - def test_inference(self): - unet = self.dummy_uncond_unet - scheduler = PNDMScheduler() - pndm = PNDMPipeline(unet=unet, scheduler=scheduler) - pndm.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - image = pndm(generator=generator, num_inference_steps=20, output_type="numpy").images - generator = paddle.Generator().manual_seed(0) - image_from_tuple = pndm(generator=generator, num_inference_steps=20, output_type="numpy", return_dict=False)[0] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 - - -@slow -@require_paddle -class PNDMPipelineIntegrationTests(unittest.TestCase): - def test_inference_cifar10(self): - model_id = "google/ddpm-cifar10-32" - unet = UNet2DModel.from_pretrained(model_id) - scheduler = PNDMScheduler() - pndm = PNDMPipeline(unet=unet, scheduler=scheduler) - pndm.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - image = pndm(generator=generator, output_type="numpy").images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [ - 0.15949559211730957, - 0.17172572016716003, - 0.17315810918807983, - 0.1836635172367096, - 0.1823960244655609, - 0.1799020767211914, - 0.21776044368743896, - 0.22992581129074097, - 0.21678516268730164, - ] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/repaint/__init__.py b/ppdiffusers/tests/pipelines/repaint/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/repaint/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/repaint/test_repaint.py b/ppdiffusers/tests/pipelines/repaint/test_repaint.py deleted file mode 100644 index b9655a7d636f..000000000000 --- a/ppdiffusers/tests/pipelines/repaint/test_repaint.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import paddle - -from ppdiffusers import RePaintPipeline, RePaintScheduler, UNet2DModel -from ppdiffusers.utils.testing_utils import ( - load_image, - load_numpy, - nightly, - require_paddle_gpu, -) - -from ..pipeline_params import IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_INPAINTING_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - - -class RepaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = RePaintPipeline - test_cpu_offload = False - params = IMAGE_INPAINTING_PARAMS - {"width", "height", "guidance_scale"} - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents", - "num_images_per_prompt", - "callback", - "callback_steps", - } - batch_params = IMAGE_INPAINTING_BATCH_PARAMS - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - scheduler = RePaintScheduler() - components = {"unet": unet, "scheduler": scheduler} - return components - - def get_dummy_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed) - - image = np.random.RandomState(seed).standard_normal((1, 3, 32, 32)) - image = paddle.to_tensor(data=image).cast("float32") - mask = (image > 0).cast("float32") - inputs = { - "image": image, - "mask_image": mask, - "generator": generator, - "num_inference_steps": 5, - "eta": 0.0, - "jump_length": 2, - "jump_n_sample": 2, - "output_type": "numpy", - } - return inputs - - def test_repaint(self): - components = self.get_dummy_components() - sd_pipe = RePaintPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [0.08341709, 0.54262626, 0.549711, 
0.00903523, 0.0, 1.0, 0.05136755, 0.5604646, 0.6273578] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - - # RePaint can hardly be made deterministic since the scheduler is currently always - # nondeterministic - @unittest.skip("non-deterministic pipeline") - def test_inference_batch_single_identical(self): - return super().test_inference_batch_single_identical() - - -@nightly -@require_paddle_gpu -class RepaintPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_celebahq(self): - original_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/celeba_hq_256.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/mask_256.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/celeba_hq_256_result.npy" - ) - model_id = "google/ddpm-ema-celebahq-256" - unet = UNet2DModel.from_pretrained(model_id) - scheduler = RePaintScheduler.from_pretrained(model_id) - repaint = RePaintPipeline(unet=unet, scheduler=scheduler) - repaint.set_progress_bar_config(disable=None) - repaint.enable_attention_slicing() - generator = paddle.Generator().manual_seed(0) - output = repaint( - original_image, - mask_image, - num_inference_steps=250, - eta=0.0, - jump_length=10, - jump_n_sample=10, - generator=generator, - output_type="np", - ) - image = output.images[0] - assert image.shape == (256, 256, 3) - assert np.abs(expected_image - image).mean() < 0.01 diff --git a/ppdiffusers/tests/pipelines/score_sde_ve/__init__.py b/ppdiffusers/tests/pipelines/score_sde_ve/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/score_sde_ve/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py b/ppdiffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py deleted file mode 100644 index f8cd507716eb..000000000000 --- a/ppdiffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import paddle - -from ppdiffusers import ScoreSdeVePipeline, ScoreSdeVeScheduler, UNet2DModel -from ppdiffusers.utils.testing_utils import require_paddle, slow - - -class ScoreSdeVeipelineFastTests(unittest.TestCase): - @property - def dummy_uncond_unet(self): - paddle.seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - def test_inference(self): - unet = self.dummy_uncond_unet - scheduler = ScoreSdeVeScheduler() - sde_ve = ScoreSdeVePipeline(unet=unet, scheduler=scheduler) - sde_ve.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - image = sde_ve(num_inference_steps=2, output_type="numpy", generator=generator).images - generator = paddle.Generator().manual_seed(0) - image_from_tuple = sde_ve(num_inference_steps=2, output_type="numpy", generator=generator, return_dict=False)[ - 0 - ] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 - - -@slow -@require_paddle -class ScoreSdeVePipelineIntegrationTests(unittest.TestCase): - def test_inference(self): - model_id = "google/ncsnpp-church-256" - model = UNet2DModel.from_pretrained(model_id) - scheduler = ScoreSdeVeScheduler.from_pretrained(model_id) - sde_ve = ScoreSdeVePipeline(unet=model, scheduler=scheduler) - sde_ve.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - image = sde_ve(num_inference_steps=10, output_type="numpy", generator=generator).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/semantic_stable_diffusion/__init__.py b/ppdiffusers/tests/pipelines/semantic_stable_diffusion/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/semantic_stable_diffusion/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/ppdiffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py b/ppdiffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py deleted file mode 100644 index d9f083dde8e1..000000000000 --- a/ppdiffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py +++ /dev/null @@ -1,509 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import tempfile -import unittest - -import numpy as np -import paddle - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - UNet2DConditionModel, -) -from ppdiffusers.pipelines.semantic_stable_diffusion import ( - SemanticStableDiffusionPipeline as StableDiffusionPipeline, -) -from ppdiffusers.utils import floats_tensor, nightly -from ppdiffusers.utils.testing_utils import require_paddle_gpu - - -class SafeDiffusionPipelineFastTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = 32, 32 - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) - return image - - @property - def dummy_cond_unet(self): - paddle.seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_vae(self): - paddle.seed(0) - model = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return model - - @property - def dummy_text_encoder(self): - paddle.seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config).eval() - - @property - def dummy_extractor(self): - def extract(*args, **kwargs): - class Out: - def __init__(self): - self.pixel_values = paddle.ones(shape=[0]) - - def to(self, device): - self.pixel_values - return self - - return Out() - - return extract - - def test_semantic_diffusion_ddim(self): - unet = self.dummy_cond_unet - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - sd_pipe = 
StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - generator = paddle.Generator().manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - image = output.images - generator = paddle.Generator().manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.28401083, 0.23724163, 0.38141036, 0.2201719, 0.26111937, 0.5176592, 0.25668317, 0.25036532, 0.47986418] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 - - def test_semantic_diffusion_no_safety_checker(self): - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None - ) - assert isinstance(pipe, StableDiffusionPipeline) - assert isinstance(pipe.scheduler, LMSDiscreteScheduler) - assert pipe.safety_checker is None - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionPipeline.from_pretrained(tmpdirname, from_diffusers=False) - assert pipe.safety_checker is None - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - def test_semantic_diffusion_pndm(self): - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - generator = paddle.Generator().manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - image = output.images - generator = paddle.Generator().manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.18612236, 0.24176982, 0.36099488, 0.21807766, 0.27262795, 0.51991826, 0.22258872, 0.22143877, 0.4452843] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.02 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.02 - - def test_semantic_diffusion_fp16(self): - """Test that stable diffusion works with fp16""" - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - unet = 
unet.to(dtype=paddle.float16) - vae = vae.to(dtype=paddle.float16) - bert = bert.to(dtype=paddle.float16) - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images - assert image.shape == (1, 64, 64, 3) - - -@nightly -@require_paddle_gpu -class SemanticDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - # paddle.device.cuda.empty_cache() - - def test_positive_guidance(self): - pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - pipe.set_progress_bar_config(disable=None) - prompt = "a photo of a cat" - edit = { - "editing_prompt": ["sunglasses"], - "reverse_editing_direction": [False], - "edit_warmup_steps": 10, - "edit_guidance_scale": 6, - "edit_threshold": 0.95, - "edit_momentum_scale": 0.5, - "edit_mom_beta": 0.6, - } - seed = 3 - guidance_scale = 7 - generator = paddle.Generator().manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - ) - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.34673113, - 0.38492733, - 0.37597352, - 0.34086335, - 0.35650748, - 0.35579205, - 0.3384763, - 0.34340236, - 0.3573271, - ] - assert image.shape == (1, 512, 512, 3) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - generator = paddle.Generator().manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - **edit, - ) - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.41887826, - 0.37728766, - 0.30138272, - 0.41416335, - 0.41664985, - 0.36283392, - 0.36191246, - 0.43364465, - 0.43001732, - ] - assert image.shape == (1, 512, 512, 3) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_negative_guidance(self): - pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - pipe.set_progress_bar_config(disable=None) - prompt = "an image of a crowded boulevard, realistic, 4k" - edit = { - "editing_prompt": "crowd, crowded, people", - "reverse_editing_direction": True, - "edit_warmup_steps": 10, - "edit_guidance_scale": 8.3, - "edit_threshold": 0.9, - "edit_momentum_scale": 0.5, - "edit_mom_beta": 0.6, - } - seed = 9 - guidance_scale = 7 - generator = paddle.Generator().manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - ) - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.43497998, - 0.91814065, - 0.7540739, - 0.55580205, - 0.8467265, - 0.5389691, - 0.62574506, - 0.58897763, - 0.50926757, - ] - assert image.shape == (1, 512, 512, 3) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - generator = paddle.Generator().manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - **edit, - ) - image = output.images - image_slice = image[0, 
-3:, -3:, -1] - expected_slice = [ - 0.3089719, - 0.30500144, - 0.29016042, - 0.30630964, - 0.325687, - 0.29419225, - 0.2908091, - 0.28723598, - 0.27696294, - ] - assert image.shape == (1, 512, 512, 3) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_multi_cond_guidance(self): - pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - pipe.set_progress_bar_config(disable=None) - prompt = "a castle next to a river" - edit = { - "editing_prompt": ["boat on a river, boat", "monet, impression, sunrise"], - "reverse_editing_direction": False, - "edit_warmup_steps": [15, 18], - "edit_guidance_scale": 6, - "edit_threshold": [0.9, 0.8], - "edit_momentum_scale": 0.5, - "edit_mom_beta": 0.6, - } - seed = 48 - guidance_scale = 7 - generator = paddle.Generator().manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - ) - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.75163555, - 0.76037145, - 0.61785, - 0.9189673, - 0.8627701, - 0.85189694, - 0.8512813, - 0.87012076, - 0.8312857, - ] - assert image.shape == (1, 512, 512, 3) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - generator = paddle.Generator().manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - **edit, - ) - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.73553365, - 0.7537271, - 0.74341905, - 0.66480356, - 0.6472925, - 0.63039416, - 0.64812905, - 0.6749717, - 0.6517102, - ] - assert image.shape == (1, 512, 512, 3) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.02 - - def test_guidance_fp16(self): - pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", paddle_dtype=paddle.float16) - pipe.set_progress_bar_config(disable=None) - prompt = "a photo of a cat" - edit = { - "editing_prompt": ["sunglasses"], - "reverse_editing_direction": [False], - "edit_warmup_steps": 10, - "edit_guidance_scale": 6, - "edit_threshold": 0.95, - "edit_momentum_scale": 0.5, - "edit_mom_beta": 0.6, - } - seed = 3 - guidance_scale = 7 - generator = paddle.Generator().manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - ) - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.34887695, - 0.3876953, - 0.375, - 0.34423828, - 0.3581543, - 0.35717773, - 0.3383789, - 0.34570312, - 0.359375, - ] - assert image.shape == (1, 512, 512, 3) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.02 - generator = paddle.Generator().manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - **edit, - ) - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.42285156, - 0.36914062, - 0.29077148, - 0.42041016, - 0.41918945, - 0.35498047, - 0.3618164, - 0.4423828, - 0.43115234, - ] - assert image.shape == (1, 512, 512, 3) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.02 diff --git a/ppdiffusers/tests/pipelines/spectrogram_diffusion/__init__.py b/ppdiffusers/tests/pipelines/spectrogram_diffusion/__init__.py 
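
Editorial note on the semantic-guidance (SEGA) tests removed above: each integration case runs the pipeline twice, once with only the base prompt and once with the extra editing keywords, then compares fixed image slices. A minimal usage sketch reconstructed from those tests follows; the checkpoint name and every keyword value are copied from the removed code, and the snippet is illustrative rather than the canonical ppdiffusers API.

import paddle
from ppdiffusers.pipelines.semantic_stable_diffusion import SemanticStableDiffusionPipeline

pipe = SemanticStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
edit = {
    "editing_prompt": ["sunglasses"],      # concept to steer the image toward
    "reverse_editing_direction": [False],  # False adds the concept, True suppresses it
    "edit_warmup_steps": 10,
    "edit_guidance_scale": 6,
    "edit_threshold": 0.95,
    "edit_momentum_scale": 0.5,
    "edit_mom_beta": 0.6,
}
generator = paddle.Generator().manual_seed(3)
image = pipe(
    ["a photo of a cat"],
    generator=generator,
    guidance_scale=7,
    num_inference_steps=50,
    output_type="np",
    width=512,
    height=512,
    **edit,                                # omit the edit kwargs to get the unedited baseline
).images[0]
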
deleted file mode 100644 index 595add0aed9e..000000000000 --- a/ppdiffusers/tests/pipelines/spectrogram_diffusion/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/ppdiffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py deleted file mode 100644 index e529e166e0b6..000000000000 --- a/ppdiffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ /dev/null @@ -1,232 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import paddle - -from ppdiffusers import DDPMScheduler, MidiProcessor, SpectrogramDiffusionPipeline -from ppdiffusers.pipelines.spectrogram_diffusion import ( - SpectrogramContEncoder, - SpectrogramNotesEncoder, - T5FilmDecoder, -) -from ppdiffusers.training_utils import enable_full_determinism -from ppdiffusers.utils import require_paddle_gpu, slow - -from ..pipeline_params import ( - TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS, - TOKENS_TO_AUDIO_GENERATION_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin - -enable_full_determinism(42) - - -MIDI_FILE = "./tests/fixtures/elise_format0.mid" - - -# The note-seq package throws an error on import because the default installed version of Ipython -# is not compatible with python 3.8 which we run in the CI. 
-# https://github.com/huggingface/diffusers/actions/runs/4830121056/jobs/8605954838#step:7:98 -# @unittest.skip("The note-seq package currently throws an error on import") -class SpectrogramDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = SpectrogramDiffusionPipeline - required_optional_params = PipelineTesterMixin.required_optional_params - { - "callback", - "latents", - "callback_steps", - "output_type", - "num_images_per_prompt", - } - test_attention_slicing = False - test_xformers_attention = False - test_cpu_offload = False - batch_params = TOKENS_TO_AUDIO_GENERATION_PARAMS - params = TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS - - def get_dummy_components(self): - paddle.seed(0) - notes_encoder = SpectrogramNotesEncoder( - max_length=2048, - vocab_size=1536, - d_model=768, - dropout_rate=0.1, - num_layers=1, - num_heads=1, - d_kv=4, - d_ff=2048, - feed_forward_proj="gated-gelu", - ) - notes_encoder.eval() - paddle.seed(0) - continuous_encoder = SpectrogramContEncoder( - input_dims=128, - targets_context_length=256, - d_model=768, - dropout_rate=0.1, - num_layers=1, - num_heads=1, - d_kv=4, - d_ff=2048, - feed_forward_proj="gated-gelu", - ) - continuous_encoder.eval() - - paddle.seed(0) - decoder = T5FilmDecoder( - input_dims=128, - targets_length=256, - max_decoder_noise_time=20000.0, - d_model=768, - num_layers=1, - num_heads=1, - d_kv=4, - d_ff=2048, - dropout_rate=0.1, - ) - decoder.eval() - - scheduler = DDPMScheduler() - - components = { - "notes_encoder": notes_encoder, - "continuous_encoder": continuous_encoder, - "decoder": decoder, - "scheduler": scheduler, - "melgan": None, - } - return components - - def get_dummy_inputs(self, seed=0): - - generator = paddle.Generator().manual_seed(seed) - inputs = { - "input_tokens": [ - [1134, 90, 1135, 1133, 1080, 112, 1132, 1080, 1133, 1079, 133, 1132, 1079, 1133, 1] + [0] * 2033 - ], - "generator": generator, - "num_inference_steps": 4, - "output_type": "mel", - } - return inputs - - def test_spectrogram_diffusion(self): - components = self.get_dummy_components() - pipe = SpectrogramDiffusionPipeline(**components) - - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - output = pipe(**inputs) - mel = output.audios - - mel_slice = mel[0, -3:, -3:] - - assert mel_slice.shape == (3, 3) - expected_slice = np.array( - [-11.46511, 4.0, -8.506372, -11.512925, -11.512925, -10.417862, -8.077912, 3.7985802, 4.0] - ) - assert np.abs(mel_slice.flatten() - expected_slice).max() < 1e-2 - - def test_save_load_local(self): - return super().test_save_load_local() - - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - def test_attention_slicing_forward_pass(self): - return super().test_attention_slicing_forward_pass() - - def test_inference_batch_single_identical(self): - pass - - def test_inference_batch_consistent(self): - pass - - def test_progress_bar(self): - return super().test_progress_bar() - - -@slow -@require_paddle_gpu -class PipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_callback(self): - # TODO - test that pipeline can decode tokens in a callback - # so that music can be played live - pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") - melgan = 
pipe.melgan - pipe.melgan = None - - pipe.set_progress_bar_config(disable=None) - - def callback(step, mel_output): - # decode mel to audio - audio = melgan(input_features=mel_output.astype(np.float32))[0] - assert len(audio[0]) == 81920 * (step + 1) - # simulate that audio is played - return audio - - processor = MidiProcessor() - input_tokens = processor(MIDI_FILE) - - input_tokens = input_tokens[:3] - generator = paddle.Generator().manual_seed(0) - pipe(input_tokens, num_inference_steps=5, generator=generator, callback=callback, output_type="mel") - - def test_spectrogram_fast(self): - - pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") - pipe.set_progress_bar_config(disable=None) - processor = MidiProcessor() - - input_tokens = processor(MIDI_FILE) - # just run two denoising loops - input_tokens = input_tokens[:2] - - generator = paddle.Generator().manual_seed(0) - output = pipe(input_tokens, num_inference_steps=2, generator=generator) - - audio = output.audios[0] - - assert abs(np.abs(audio).sum() - 3815.163) < 1e-1 - - def test_spectrogram(self): - - pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") - pipe.set_progress_bar_config(disable=None) - - processor = MidiProcessor() - - input_tokens = processor(MIDI_FILE) - - # just run 4 denoising loops - input_tokens = input_tokens[:4] - - generator = paddle.Generator().manual_seed(0) - output = pipe(input_tokens, num_inference_steps=100, generator=generator) - - audio = output.audios[0] - assert abs(np.abs(audio).sum() - 14418.089) < 5e-2 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/__init__.py b/ppdiffusers/tests/pipelines/stable_diffusion/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py deleted file mode 100644 index fc4657604b99..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py +++ /dev/null @@ -1,224 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
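
Editorial note on the spectrogram-diffusion tests removed above: they tokenize a MIDI file with MidiProcessor, feed a slice of the token groups to the pipeline (each group drives one denoising loop), and check the summed magnitude of the generated audio. A minimal sketch reconstructed from those tests; checkpoint name, fixture path, and arguments are taken from the removed code and the snippet is illustrative only.

import paddle
from ppdiffusers import MidiProcessor, SpectrogramDiffusionPipeline

pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion")
processor = MidiProcessor()
# Slicing the token groups keeps the run short, as the removed "fast" test does.
input_tokens = processor("./tests/fixtures/elise_format0.mid")[:2]
generator = paddle.Generator().manual_seed(0)
audio = pipe(input_tokens, num_inference_steps=2, generator=generator).audios[0]
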
- -import gc -import random -import unittest - -import numpy as np -import paddle -from ..pipeline_params import ( - TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - CycleDiffusionPipeline, - DDIMScheduler, - UNet2DConditionModel, -) -from ppdiffusers.utils import floats_tensor, load_image, slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - - -class CycleDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = CycleDiffusionPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - { - "negative_prompt", - "height", - "width", - "negative_prompt_embeds", - } - required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"source_prompt"}) - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - num_train_timesteps=1000, - clip_sample=False, - set_alpha_to_one=False, - ) - paddle.seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - paddle.seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, seed=0): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - generator = paddle.Generator().manual_seed(seed) - - inputs = { - "prompt": "An astronaut riding an elephant", - "source_prompt": "An astronaut riding a horse", - "image": image, - "generator": generator, - "num_inference_steps": 2, - "eta": 0.1, - "strength": 0.8, - "guidance_scale": 3, - "source_guidance_scale": 1, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_cycle(self): - components = self.get_dummy_components() - pipe = CycleDiffusionPipeline(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - output = pipe(**inputs) - images = output.images - image_slice = images[0, -3:, -3:, -1] - assert images.shape == (1, 32, 32, 3) - expected_slice = np.array( - [0.04812625, 0.77983606, 0.71009433, 0.15924984, 0.9788434, 0.49732354, 0.362224, 0.6481595, 0.4530744] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_cycle_fp16(self): - components = self.get_dummy_components() - for name, module in components.items(): - if hasattr(module, 
"to"): - components[name] = module.to(dtype=paddle.float16) - pipe = CycleDiffusionPipeline(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - output = pipe(**inputs) - images = output.images - image_slice = images[0, -3:, -3:, -1] - assert images.shape == (1, 32, 32, 3) - expected_slice = np.array( - [0.05053711, 0.78125, 0.7114258, 0.15991211, 0.9785156, 0.49804688, 0.36279297, 0.6484375, 0.45361328] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - @unittest.skip("non-deterministic pipeline") - def test_inference_batch_single_identical(self): - return super().test_inference_batch_single_identical() - - -@slow -@require_paddle_gpu -class CycleDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_cycle_diffusion_pipeline_fp16(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/cycle-diffusion/black_colored_car.png" - ) - expected_image = np.array([[0.14477539, 0.20483398, 0.14135742], [0.10009766, 0.17602539, 0.11083984]]) - init_image = init_image.resize((512, 512)) - model_id = "CompVis/stable-diffusion-v1-4" - scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler") - pipe = CycleDiffusionPipeline.from_pretrained( - model_id, scheduler=scheduler, safety_checker=None, paddle_dtype=paddle.float16, revision="fp16" - ) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - source_prompt = "A black colored car" - prompt = "A blue colored car" - generator = paddle.Generator().manual_seed(0) - output = pipe( - prompt=prompt, - source_prompt=source_prompt, - image=init_image, - num_inference_steps=100, - eta=0.1, - strength=0.85, - guidance_scale=3, - source_guidance_scale=1, - generator=generator, - output_type="np", - ) - image = output.images - assert np.abs(image[0][0][:2] - expected_image).max() < 0.5 - - def test_cycle_diffusion_pipeline(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/cycle-diffusion/black_colored_car.png" - ) - expected_image = np.array([[0.16294342, 0.20514232, 0.14554858], [0.11476257, 0.16831946, 0.11495486]]) - init_image = init_image.resize((512, 512)) - model_id = "CompVis/stable-diffusion-v1-4" - scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler") - pipe = CycleDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, safety_checker=None) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - source_prompt = "A black colored car" - prompt = "A blue colored car" - generator = paddle.Generator().manual_seed(0) - output = pipe( - prompt=prompt, - source_prompt=source_prompt, - image=init_image, - num_inference_steps=100, - eta=0.1, - strength=0.85, - guidance_scale=3, - source_guidance_scale=1, - generator=generator, - output_type="np", - ) - image = output.images - assert np.abs(image[0][0][:2] - expected_image).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py deleted file mode 100644 index 5e9857df9456..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py +++ /dev/null @@ -1,265 +0,0 @@ -# # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
-# Copyright 2023 The HuggingFace Team. All rights reserved. -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # http://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, software -# # distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. - -# import tempfile -# import unittest - -# import numpy as np - -# from ppdiffusers import ( -# DDIMScheduler, -# DPMSolverMultistepScheduler, -# EulerAncestralDiscreteScheduler, -# EulerDiscreteScheduler, -# LMSDiscreteScheduler, -# OnnxStableDiffusionPipeline, -# PNDMScheduler, -# ) -# from ppdiffusers.utils.testing_utils import ( -# is_onnx_available, -# nightly, -# require_onnxruntime, -# require_paddle_gpu, -# ) - -# from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin - -# if is_onnx_available(): -# import onnxruntime as ort - - -# class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase): -# hub_checkpoint = "hf-internal-testing/tiny-random-OnnxStableDiffusionPipeline" - -# def get_dummy_inputs(self, seed=0): -# generator = np.random.RandomState(seed) -# inputs = { -# "prompt": "A painting of a squirrel eating a burger", -# "generator": generator, -# "num_inference_steps": 2, -# "guidance_scale": 7.5, -# "output_type": "numpy", -# } -# return inputs - -# def test_pipeline_default_ddim(self): -# pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") -# pipe.set_progress_bar_config(disable=None) -# inputs = self.get_dummy_inputs() -# image = pipe(**inputs).images -# image_slice = image[0, -3:, -3:, -1] -# assert image.shape == (1, 128, 128, 3) -# expected_slice = np.array([0.65072, 0.58492, 0.48219, 0.55521, 0.5318, 0.55939, 0.50697, 0.398, 0.46455]) -# assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - -# def test_pipeline_pndm(self): -# pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") -# pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config, skip_prk_steps=True) -# pipe.set_progress_bar_config(disable=None) -# inputs = self.get_dummy_inputs() -# image = pipe(**inputs).images -# image_slice = image[0, -3:, -3:, -1] -# assert image.shape == (1, 128, 128, 3) -# expected_slice = np.array([0.65863, 0.59425, 0.49326, 0.56313, 0.53875, 0.56627, 0.51065, 0.39777, 0.4633]) -# assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - -# def test_pipeline_lms(self): -# pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") -# pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) -# pipe.set_progress_bar_config(disable=None) -# inputs = self.get_dummy_inputs() -# image = pipe(**inputs).images -# image_slice = image[0, -3:, -3:, -1] -# assert image.shape == (1, 128, 128, 3) -# expected_slice = np.array([0.53755, 0.60786, 0.47402, 0.49488, 0.51869, 0.49819, 0.47985, 0.38957, 0.44279]) -# assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - -# def test_pipeline_euler(self): -# pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") -# pipe.scheduler = 
EulerDiscreteScheduler.from_config(pipe.scheduler.config) -# pipe.set_progress_bar_config(disable=None) -# inputs = self.get_dummy_inputs() -# image = pipe(**inputs).images -# image_slice = image[0, -3:, -3:, -1] -# assert image.shape == (1, 128, 128, 3) -# expected_slice = np.array([0.53755, 0.60786, 0.47402, 0.49488, 0.51869, 0.49819, 0.47985, 0.38957, 0.44279]) -# assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - -# def test_pipeline_euler_ancestral(self): -# pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") -# pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) -# pipe.set_progress_bar_config(disable=None) -# inputs = self.get_dummy_inputs() -# image = pipe(**inputs).images -# image_slice = image[0, -3:, -3:, -1] -# assert image.shape == (1, 128, 128, 3) -# expected_slice = np.array([0.53817, 0.60812, 0.47384, 0.4953, 0.51894, 0.49814, 0.47984, 0.38958, 0.44271]) -# assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - -# def test_pipeline_dpm_multistep(self): -# pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") -# pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) -# pipe.set_progress_bar_config(disable=None) -# inputs = self.get_dummy_inputs() -# image = pipe(**inputs).images -# image_slice = image[0, -3:, -3:, -1] -# assert image.shape == (1, 128, 128, 3) -# expected_slice = np.array([0.53895, 0.60808, 0.47933, 0.49608, 0.51886, 0.4995, 0.48053, 0.38957, 0.442]) -# assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - -# @nightly -# @require_onnxruntime -# @require_paddle_gpu -# class OnnxStableDiffusionPipelineIntegrationTests(unittest.TestCase): -# @property -# def gpu_provider(self): -# return "CUDAExecutionProvider", {"gpu_mem_limit": "15000000000", "arena_extend_strategy": "kSameAsRequested"} - -# @property -# def gpu_options(self): -# options = ort.SessionOptions() -# options.enable_mem_pattern = False -# return options - -# def test_inference_default_pndm(self): -# sd_pipe = OnnxStableDiffusionPipeline.from_pretrained( -# "CompVis/stable-diffusion-v1-4", -# revision="onnx", -# safety_checker=None, -# feature_extractor=None, -# provider=self.gpu_provider, -# sess_options=self.gpu_options, -# ) -# sd_pipe.set_progress_bar_config(disable=None) -# prompt = "A painting of a squirrel eating a burger" -# np.random.seed(0) -# output = sd_pipe([prompt], guidance_scale=6.0, num_inference_steps=10, output_type="np") -# image = output.images -# image_slice = image[0, -3:, -3:, -1] -# assert image.shape == (1, 512, 512, 3) -# expected_slice = np.array([0.0452, 0.039, 0.0087, 0.035, 0.0617, 0.0364, 0.0544, 0.0523, 0.072]) -# assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - -# def test_inference_ddim(self): -# ddim_scheduler = DDIMScheduler.from_pretrained( -# "runwayml/stable-diffusion-v1-5", subfolder="scheduler", revision="onnx" -# ) -# sd_pipe = OnnxStableDiffusionPipeline.from_pretrained( -# "runwayml/stable-diffusion-v1-5", -# revision="onnx", -# scheduler=ddim_scheduler, -# safety_checker=None, -# feature_extractor=None, -# provider=self.gpu_provider, -# sess_options=self.gpu_options, -# ) -# sd_pipe.set_progress_bar_config(disable=None) -# prompt = "open neural network exchange" -# generator = np.random.RandomState(0) -# output = sd_pipe([prompt], guidance_scale=7.5, num_inference_steps=10, generator=generator, output_type="np") -# image = 
output.images -# image_slice = image[0, -3:, -3:, -1] -# assert image.shape == (1, 512, 512, 3) -# expected_slice = np.array([0.2867, 0.1974, 0.1481, 0.7294, 0.7251, 0.6667, 0.4194, 0.5642, 0.6486]) -# assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - -# def test_inference_k_lms(self): -# lms_scheduler = LMSDiscreteScheduler.from_pretrained( -# "runwayml/stable-diffusion-v1-5", subfolder="scheduler", revision="onnx" -# ) -# sd_pipe = OnnxStableDiffusionPipeline.from_pretrained( -# "runwayml/stable-diffusion-v1-5", -# revision="onnx", -# scheduler=lms_scheduler, -# safety_checker=None, -# feature_extractor=None, -# provider=self.gpu_provider, -# sess_options=self.gpu_options, -# ) -# sd_pipe.set_progress_bar_config(disable=None) -# prompt = "open neural network exchange" -# generator = np.random.RandomState(0) -# output = sd_pipe([prompt], guidance_scale=7.5, num_inference_steps=10, generator=generator, output_type="np") -# image = output.images -# image_slice = image[0, -3:, -3:, -1] -# assert image.shape == (1, 512, 512, 3) -# expected_slice = np.array([0.2306, 0.1959, 0.1593, 0.6549, 0.6394, 0.5408, 0.5065, 0.601, 0.6161]) -# assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - -# def test_intermediate_state(self): -# number_of_steps = 0 - -# def test_callback_fn(step: int, timestep: int, latents: np.ndarray) -> None: -# test_callback_fn.has_been_called = True -# nonlocal number_of_steps -# number_of_steps += 1 -# if step == 0: -# assert latents.shape == (1, 4, 64, 64) -# latents_slice = latents[0, -3:, -3:, -1] -# expected_slice = np.array( -# [-0.6772, -0.3835, -1.2456, 0.1905, -1.0974, 0.6967, -1.9353, 0.0178, 1.0167] -# ) -# assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.001 -# elif step == 5: -# assert latents.shape == (1, 4, 64, 64) -# latents_slice = latents[0, -3:, -3:, -1] -# expected_slice = np.array( -# [-0.3351, 0.2241, -0.1837, -0.2325, -0.6577, 0.3393, -0.0241, 0.5899, 1.3875] -# ) -# assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.001 - -# test_callback_fn.has_been_called = False -# pipe = OnnxStableDiffusionPipeline.from_pretrained( -# "runwayml/stable-diffusion-v1-5", -# revision="onnx", -# safety_checker=None, -# feature_extractor=None, -# provider=self.gpu_provider, -# sess_options=self.gpu_options, -# ) -# pipe.set_progress_bar_config(disable=None) -# prompt = "Andromeda galaxy in a bottle" -# generator = np.random.RandomState(0) -# pipe( -# prompt=prompt, -# num_inference_steps=5, -# guidance_scale=7.5, -# generator=generator, -# callback=test_callback_fn, -# callback_steps=1, -# ) -# assert test_callback_fn.has_been_called -# assert number_of_steps == 6 - -# def test_stable_diffusion_no_safety_checker(self): -# pipe = OnnxStableDiffusionPipeline.from_pretrained( -# "runwayml/stable-diffusion-v1-5", -# revision="onnx", -# safety_checker=None, -# feature_extractor=None, -# provider=self.gpu_provider, -# sess_options=self.gpu_options, -# ) -# assert isinstance(pipe, OnnxStableDiffusionPipeline) -# assert pipe.safety_checker is None -# image = pipe("example prompt", num_inference_steps=2).images[0] -# assert image is not None -# with tempfile.TemporaryDirectory() as tmpdirname: -# pipe.save_pretrained(tmpdirname) -# pipe = OnnxStableDiffusionPipeline.from_pretrained(tmpdirname) -# assert pipe.safety_checker is None -# image = pipe("example prompt", num_inference_steps=2).images[0] -# assert image is not None diff --git 
a/ppdiffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py deleted file mode 100644 index 8ae52b72c80f..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py +++ /dev/null @@ -1,206 +0,0 @@ -# # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # http://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, software -# # distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. - -# import random -# import unittest - -# import numpy as np - -# from ppdiffusers import ( -# DPMSolverMultistepScheduler, -# EulerAncestralDiscreteScheduler, -# EulerDiscreteScheduler, -# LMSDiscreteScheduler, -# OnnxStableDiffusionImg2ImgPipeline, -# PNDMScheduler, -# ) -# from ppdiffusers.utils import floats_tensor -# from ppdiffusers.utils.testing_utils import ( -# is_onnx_available, -# load_image, -# nightly, -# require_onnxruntime, -# require_paddle_gpu, -# ) - -# from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin - -# if is_onnx_available(): -# import onnxruntime as ort - - -# class OnnxStableDiffusionImg2ImgPipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase): -# hub_checkpoint = "hf-internal-testing/tiny-random-OnnxStableDiffusionPipeline" - -# def get_dummy_inputs(self, seed=0): -# image = floats_tensor((1, 3, 128, 128), rng=random.Random(seed)) -# generator = np.random.RandomState(seed) -# inputs = { -# "prompt": "A painting of a squirrel eating a burger", -# "image": image, -# "generator": generator, -# "num_inference_steps": 3, -# "strength": 0.75, -# "guidance_scale": 7.5, -# "output_type": "numpy", -# } -# return inputs - -# def test_pipeline_default_ddim(self): -# pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") -# pipe.set_progress_bar_config(disable=None) -# inputs = self.get_dummy_inputs() -# image = pipe(**inputs).images -# image_slice = image[0, -3:, -3:, -1].flatten() -# assert image.shape == (1, 128, 128, 3) -# expected_slice = np.array([0.69643, 0.58484, 0.50314, 0.5876, 0.55368, 0.59643, 0.51529, 0.41217, 0.49087]) -# assert np.abs(image_slice - expected_slice).max() < 0.1 - -# def test_pipeline_pndm(self): -# pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") -# pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config, skip_prk_steps=True) -# pipe.set_progress_bar_config(disable=None) -# inputs = self.get_dummy_inputs() -# image = pipe(**inputs).images -# image_slice = image[0, -3:, -3:, -1] -# assert image.shape == (1, 128, 128, 3) -# expected_slice = np.array([0.6171, 0.5339, 0.4931, 0.55622, 0.50982, 0.5824, 0.50716, 0.38629, 0.46856]) -# assert np.abs(image_slice.flatten() - expected_slice).max() < 0.1 - -# def test_pipeline_lms(self): -# pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") -# pipe.scheduler = 
LMSDiscreteScheduler.from_config(pipe.scheduler.config) -# pipe.set_progress_bar_config(disable=None) -# _ = pipe(**self.get_dummy_inputs()) -# inputs = self.get_dummy_inputs() -# image = pipe(**inputs).images -# image_slice = image[0, -3:, -3:, -1] -# assert image.shape == (1, 128, 128, 3) -# expected_slice = np.array([0.52761, 0.59977, 0.49033, 0.49619, 0.54282, 0.50311, 0.476, 0.40918, 0.45203]) -# assert np.abs(image_slice.flatten() - expected_slice).max() < 0.1 - -# def test_pipeline_euler(self): -# pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") -# pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) -# pipe.set_progress_bar_config(disable=None) -# inputs = self.get_dummy_inputs() -# image = pipe(**inputs).images -# image_slice = image[0, -3:, -3:, -1] -# assert image.shape == (1, 128, 128, 3) -# expected_slice = np.array([0.52911, 0.60004, 0.49229, 0.49805, 0.54502, 0.5068, 0.47777, 0.41028, 0.45304]) -# assert np.abs(image_slice.flatten() - expected_slice).max() < 0.1 - -# def test_pipeline_euler_ancestral(self): -# pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") -# pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) -# pipe.set_progress_bar_config(disable=None) -# inputs = self.get_dummy_inputs() -# image = pipe(**inputs).images -# image_slice = image[0, -3:, -3:, -1] -# assert image.shape == (1, 128, 128, 3) -# expected_slice = np.array([0.52911, 0.60004, 0.49229, 0.49805, 0.54502, 0.5068, 0.47777, 0.41028, 0.45304]) -# assert np.abs(image_slice.flatten() - expected_slice).max() < 0.1 - -# def test_pipeline_dpm_multistep(self): -# pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") -# pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) -# pipe.set_progress_bar_config(disable=None) -# inputs = self.get_dummy_inputs() -# image = pipe(**inputs).images -# image_slice = image[0, -3:, -3:, -1] -# assert image.shape == (1, 128, 128, 3) -# expected_slice = np.array([0.65331, 0.58277, 0.48204, 0.56059, 0.53665, 0.56235, 0.50969, 0.40009, 0.46552]) -# assert np.abs(image_slice.flatten() - expected_slice).max() < 0.1 - - -# @nightly -# @require_onnxruntime -# @require_paddle_gpu -# class OnnxStableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase): -# @property -# def gpu_provider(self): -# return "CUDAExecutionProvider", {"gpu_mem_limit": "15000000000", "arena_extend_strategy": "kSameAsRequested"} - -# @property -# def gpu_options(self): -# options = ort.SessionOptions() -# options.enable_mem_pattern = False -# return options - -# def test_inference_default_pndm(self): -# init_image = load_image( -# "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/sketch-mountains-input.jpg" -# ) -# init_image = init_image.resize((768, 512)) -# pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained( -# "CompVis/stable-diffusion-v1-4", -# revision="onnx", -# safety_checker=None, -# feature_extractor=None, -# provider=self.gpu_provider, -# sess_options=self.gpu_options, -# ) -# pipe.set_progress_bar_config(disable=None) -# prompt = "A fantasy landscape, trending on artstation" -# generator = np.random.RandomState(0) -# output = pipe( -# prompt=prompt, -# image=init_image, -# strength=0.75, -# guidance_scale=7.5, -# num_inference_steps=10, -# generator=generator, -# output_type="np", -# ) -# 
images = output.images -# image_slice = images[0, 255:258, 383:386, -1] -# assert images.shape == (1, 512, 768, 3) -# expected_slice = np.array([0.4909, 0.5059, 0.5372, 0.4623, 0.4876, 0.5049, 0.482, 0.4956, 0.5019]) -# assert np.abs(image_slice.flatten() - expected_slice).max() < 0.02 - -# def test_inference_k_lms(self): -# init_image = load_image( -# "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/sketch-mountains-input.jpg" -# ) -# init_image = init_image.resize((768, 512)) -# lms_scheduler = LMSDiscreteScheduler.from_pretrained( -# "runwayml/stable-diffusion-v1-5", subfolder="scheduler", revision="onnx" -# ) -# pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained( -# "runwayml/stable-diffusion-v1-5", -# revision="onnx", -# scheduler=lms_scheduler, -# safety_checker=None, -# feature_extractor=None, -# provider=self.gpu_provider, -# sess_options=self.gpu_options, -# ) -# pipe.set_progress_bar_config(disable=None) -# prompt = "A fantasy landscape, trending on artstation" -# generator = np.random.RandomState(0) -# output = pipe( -# prompt=prompt, -# image=init_image, -# strength=0.75, -# guidance_scale=7.5, -# num_inference_steps=20, -# generator=generator, -# output_type="np", -# ) -# images = output.images -# image_slice = images[0, 255:258, 383:386, -1] -# assert images.shape == (1, 512, 768, 3) -# expected_slice = np.array([0.8043, 0.926, 0.9581, 0.8119, 0.8954, 0.913, 0.7209, 0.7463, 0.7431]) -# assert np.abs(image_slice.flatten() - expected_slice).max() < 0.02 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py deleted file mode 100644 index e0a90f54e580..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py +++ /dev/null @@ -1,121 +0,0 @@ -# # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # http://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, software -# # distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. 
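
Editorial note on the ONNX pipeline tests removed above (they were already fully commented out before this deletion): the fast tests all follow one pattern, loading a tiny hub checkpoint on the CPU execution provider, swapping the scheduler via from_config, and checking an image slice. A minimal sketch of that pattern, with names and values taken from the removed code; treat it as illustrative, not as the canonical API.

import random
import numpy as np
from ppdiffusers import LMSDiscreteScheduler, OnnxStableDiffusionImg2ImgPipeline
from ppdiffusers.utils import floats_tensor

pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(
    "hf-internal-testing/tiny-random-OnnxStableDiffusionPipeline", provider="CPUExecutionProvider"
)
# Schedulers are hot-swappable: rebuild the replacement from the current scheduler's config.
pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)

image = floats_tensor((1, 3, 128, 128), rng=random.Random(0))
generator = np.random.RandomState(0)  # ONNX pipelines take a NumPy RandomState, not a paddle.Generator
out = pipe(
    prompt="A painting of a squirrel eating a burger",
    image=image,
    generator=generator,
    num_inference_steps=3,
    strength=0.75,
    guidance_scale=7.5,
    output_type="numpy",
)
print(out.images.shape)  # the removed test expects (1, 128, 128, 3) for this tiny checkpoint
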
- -# import unittest - -# import numpy as np - -# from ppdiffusers import LMSDiscreteScheduler, OnnxStableDiffusionInpaintPipeline -# from ppdiffusers.utils.testing_utils import ( -# is_onnx_available, -# load_image, -# nightly, -# require_onnxruntime, -# require_paddle_gpu, -# ) - -# from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin - -# if is_onnx_available(): -# import onnxruntime as ort - - -# class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase): -# pass - - -# @nightly -# @require_onnxruntime -# @require_paddle_gpu -# class OnnxStableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase): -# @property -# def gpu_provider(self): -# return "CUDAExecutionProvider", {"gpu_mem_limit": "15000000000", "arena_extend_strategy": "kSameAsRequested"} - -# @property -# def gpu_options(self): -# options = ort.SessionOptions() -# options.enable_mem_pattern = False -# return options - -# def test_inference_default_pndm(self): -# init_image = load_image( -# "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint/overture-creations-5sI6fQgYIuo.png" -# ) -# mask_image = load_image( -# "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint/overture-creations-5sI6fQgYIuo_mask.png" -# ) -# pipe = OnnxStableDiffusionInpaintPipeline.from_pretrained( -# "runwayml/stable-diffusion-inpainting", -# revision="onnx", -# safety_checker=None, -# feature_extractor=None, -# provider=self.gpu_provider, -# sess_options=self.gpu_options, -# ) -# pipe.set_progress_bar_config(disable=None) -# prompt = "A red cat sitting on a park bench" -# generator = np.random.RandomState(0) -# output = pipe( -# prompt=prompt, -# image=init_image, -# mask_image=mask_image, -# guidance_scale=7.5, -# num_inference_steps=10, -# generator=generator, -# output_type="np", -# ) -# images = output.images -# image_slice = images[0, 255:258, 255:258, -1] -# assert images.shape == (1, 512, 512, 3) -# expected_slice = np.array([0.2514, 0.3007, 0.3517, 0.179, 0.2382, 0.3167, 0.1944, 0.2273, 0.2464]) -# assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - -# def test_inference_k_lms(self): -# init_image = load_image( -# "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint/overture-creations-5sI6fQgYIuo.png" -# ) -# mask_image = load_image( -# "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint/overture-creations-5sI6fQgYIuo_mask.png" -# ) -# lms_scheduler = LMSDiscreteScheduler.from_pretrained( -# "runwayml/stable-diffusion-inpainting", subfolder="scheduler", revision="onnx" -# ) -# pipe = OnnxStableDiffusionInpaintPipeline.from_pretrained( -# "runwayml/stable-diffusion-inpainting", -# revision="onnx", -# scheduler=lms_scheduler, -# safety_checker=None, -# feature_extractor=None, -# provider=self.gpu_provider, -# sess_options=self.gpu_options, -# ) -# pipe.set_progress_bar_config(disable=None) -# prompt = "A red cat sitting on a park bench" -# generator = np.random.RandomState(0) -# output = pipe( -# prompt=prompt, -# image=init_image, -# mask_image=mask_image, -# guidance_scale=7.5, -# num_inference_steps=20, -# generator=generator, -# output_type="np", -# ) -# images = output.images -# image_slice = images[0, 255:258, 255:258, -1] -# assert images.shape == (1, 512, 512, 3) -# expected_slice = np.array([0.0086, 0.0077, 0.0083, 0.0093, 0.0107, 0.0139, 0.0094, 0.0097, 0.0125]) -# assert np.abs(image_slice.flatten() - 
expected_slice).max() < 0.001 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint_legacy.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint_legacy.py deleted file mode 100644 index e195a574c85a..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint_legacy.py +++ /dev/null @@ -1,81 +0,0 @@ -# # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # http://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, software -# # distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. - -# import unittest - -# import numpy as np - -# from ppdiffusers import OnnxStableDiffusionInpaintPipelineLegacy -# from ppdiffusers.utils.testing_utils import ( -# is_onnx_available, -# load_image, -# load_numpy, -# nightly, -# require_onnxruntime, -# require_paddle_gpu, -# ) - -# if is_onnx_available(): -# import onnxruntime as ort - - -# @nightly -# @require_onnxruntime -# @require_paddle_gpu -# class StableDiffusionOnnxInpaintLegacyPipelineIntegrationTests(unittest.TestCase): -# @property -# def gpu_provider(self): -# return "CUDAExecutionProvider", {"gpu_mem_limit": "15000000000", "arena_extend_strategy": "kSameAsRequested"} - -# @property -# def gpu_options(self): -# options = ort.SessionOptions() -# options.enable_mem_pattern = False -# return options - -# def test_inference(self): -# init_image = load_image( -# "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint/overture-creations-5sI6fQgYIuo.png" -# ) -# mask_image = load_image( -# "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint/overture-creations-5sI6fQgYIuo_mask.png" -# ) -# expected_image = load_numpy( -# "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint/red_cat_sitting_on_a_park_bench_onnx.npy" -# ) -# pipe = OnnxStableDiffusionInpaintPipelineLegacy.from_pretrained( -# "CompVis/stable-diffusion-v1-4", -# revision="onnx", -# safety_checker=None, -# feature_extractor=None, -# provider=self.gpu_provider, -# sess_options=self.gpu_options, -# ) -# pipe.set_progress_bar_config(disable=None) -# prompt = "A red cat sitting on a park bench" -# generator = np.random.RandomState(0) -# output = pipe( -# prompt=prompt, -# image=init_image, -# mask_image=mask_image, -# strength=0.75, -# guidance_scale=7.5, -# num_inference_steps=15, -# generator=generator, -# output_type="np", -# ) -# image = output.images[0] -# assert image.shape == (512, 512, 3) -# assert np.abs(expected_image - image).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py deleted file mode 100644 index 2052d23f815c..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ /dev/null @@ -1,674 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. 
All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import tempfile -import unittest - -import numpy as np -import paddle -from ..pipeline_params import ( - TEXT_TO_IMAGE_BATCH_PARAMS, - TEXT_TO_IMAGE_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionPipeline, - UNet2DConditionModel, - logging, -) -from ppdiffusers.utils import load_numpy, nightly, slow -from ppdiffusers.utils.testing_utils import CaptureLogger, require_paddle_gpu - -from ...models.test_models_unet_2d_condition import create_lora_layers - - -class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - paddle.seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - paddle.seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_ddim(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - 
expected_slice = np.array( - [0.28519553, 0.23807192, 0.38150552, 0.21930423, 0.26092762, 0.51721215, 0.25639117, 0.25039536, 0.47978917] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_lora(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - lora_attn_procs = create_lora_layers(sd_pipe.unet) - sd_pipe.unet.set_attn_processor(lora_attn_procs) - inputs = self.get_dummy_inputs() - output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.0}) - image = output.images - image_slice_1 = image[0, -3:, -3:, -1] - inputs = self.get_dummy_inputs() - output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.5}) - image = output.images - image_slice_2 = image[0, -3:, -3:, -1] - assert np.abs(image_slice - image_slice_1).max() < 0.01 - assert np.abs(image_slice - image_slice_2).max() > 0.01 - - def test_stable_diffusion_prompt_embeds(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - inputs["prompt"] = 3 * [inputs["prompt"]] - output = sd_pipe(**inputs) - image_slice_1 = output.images[0, -3:, -3:, -1] - inputs = self.get_dummy_inputs() - prompt = 3 * [inputs.pop("prompt")] - text_inputs = sd_pipe.tokenizer( - prompt, - padding="max_length", - max_length=sd_pipe.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_inputs = text_inputs["input_ids"] - prompt_embeds = sd_pipe.text_encoder(text_inputs)[0] - inputs["prompt_embeds"] = prompt_embeds - output = sd_pipe(**inputs) - image_slice_2 = output.images[0, -3:, -3:, -1] - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 0.0001 - - def test_stable_diffusion_negative_prompt_embeds(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - negative_prompt = 3 * ["this is a negative prompt"] - inputs["negative_prompt"] = negative_prompt - inputs["prompt"] = 3 * [inputs["prompt"]] - output = sd_pipe(**inputs) - image_slice_1 = output.images[0, -3:, -3:, -1] - inputs = self.get_dummy_inputs() - prompt = 3 * [inputs.pop("prompt")] - embeds = [] - for p in [prompt, negative_prompt]: - text_inputs = sd_pipe.tokenizer( - p, - padding="max_length", - max_length=sd_pipe.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", - ) - text_inputs = text_inputs["input_ids"] - embeds.append(sd_pipe.text_encoder(text_inputs)[0]) - inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds - output = sd_pipe(**inputs) - image_slice_2 = output.images[0, -3:, -3:, -1] - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 0.0001 - - def test_stable_diffusion_ddim_factor_8(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - output = sd_pipe(**inputs, height=136, width=136) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 136, 136, 3) - expected_slice = np.array( - [0.39545745, 0.94682777, 0.6828775 , 0.42496994, 0.49475053, 0.48353004, 0.27300328, 0.30724254, 0.50566095] - ) 
- - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_pndm(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = PNDMScheduler(skip_prk_steps=True) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.18620703, 0.24143961, 0.3609084 , 0.21810293, 0.27230006, 0.51992655, 0.22248471, 0.2213102 , 0.44538254] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_no_safety_checker(self): - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None - ) - assert isinstance(pipe, StableDiffusionPipeline) - assert isinstance(pipe.scheduler, LMSDiscreteScheduler) - assert pipe.safety_checker is None - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionPipeline.from_pretrained(tmpdirname, from_diffusers=False) - assert pipe.safety_checker is None - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - def test_stable_diffusion_k_lms(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.29910105, 0.22905633, 0.37701294, 0.21332851, 0.26000416, 0.52840894, 0.25865072, 0.25947532, 0.47509664] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_k_euler_ancestral(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.29917336, 0.22854236, 0.37669897, 0.2137424 , 0.25940597, 0.528258 , 0.25919583, 0.2594489 , 0.47522712] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_k_euler(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.29910135, 0.22905621, 0.3770129 , 0.21332836, 0.26000386, 0.52840906, 0.2586509 , 0.2594754 , 0.47509673] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_vae_slicing(self): - components = self.get_dummy_components() - components["scheduler"] = 
LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - image_count = 4 - inputs = self.get_dummy_inputs() - inputs["prompt"] = [inputs["prompt"]] * image_count - output_1 = sd_pipe(**inputs) - sd_pipe.enable_vae_slicing() - inputs = self.get_dummy_inputs() - inputs["prompt"] = [inputs["prompt"]] * image_count - output_2 = sd_pipe(**inputs) - assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 0.003 - - def test_stable_diffusion_vae_tiling(self): - components = self.get_dummy_components() - - # make sure here that pndm scheduler skips prk - components["safety_checker"] = None - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - - # Test that tiled decode at 512x512 yields the same result as the non-tiled decode - generator = paddle.Generator().manual_seed(0) - output_1 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - - # make sure tiled vae decode yields the same result - sd_pipe.enable_vae_tiling() - generator = paddle.Generator().manual_seed(0) - output_2 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - - assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 5e-1 - - def test_stable_diffusion_negative_prompt(self): - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.16709289, 0.26912582, 0.35834038, 0.23045751, 0.30960953, 0.5324909 , 0.20372942, 0.2368694 , 0.43633103] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_num_images_per_prompt(self): - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - images = sd_pipe(prompt, num_inference_steps=2, output_type="np").images - assert images.shape == (1, 64, 64, 3) - batch_size = 2 - images = sd_pipe([prompt] * batch_size, num_inference_steps=2, output_type="np").images - assert images.shape == (batch_size, 64, 64, 3) - num_images_per_prompt = 2 - images = sd_pipe( - prompt, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt - ).images - assert images.shape == (num_images_per_prompt, 64, 64, 3) - batch_size = 2 - images = sd_pipe( - [prompt] * batch_size, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt - ).images - assert images.shape == (batch_size * num_images_per_prompt, 64, 64, 3) - - def test_stable_diffusion_long_prompt(self): - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - do_classifier_free_guidance = True - negative_prompt = None - 
num_images_per_prompt = 1 - logger = logging.get_logger("ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion") - prompt = 25 * "@" - with CaptureLogger(logger) as cap_logger_3: - text_embeddings_3 = sd_pipe._encode_prompt( - prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - prompt = 100 * "@" - with CaptureLogger(logger) as cap_logger: - text_embeddings = sd_pipe._encode_prompt( - prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - negative_prompt = "Hello" - with CaptureLogger(logger) as cap_logger_2: - text_embeddings_2 = sd_pipe._encode_prompt( - prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape - assert text_embeddings.shape[1] == 77 - assert cap_logger.out == cap_logger_2.out - assert cap_logger.out.count("@") == 25 - assert cap_logger_3.out == "" - - def test_stable_diffusion_height_width_opt(self): - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - prompt = "hey" - output = sd_pipe(prompt, num_inference_steps=1, output_type="np") - image_shape = output.images[0].shape[:2] - assert image_shape == (64, 64) - output = sd_pipe(prompt, num_inference_steps=1, height=96, width=96, output_type="np") - image_shape = output.images[0].shape[:2] - assert image_shape == (96, 96) - config = dict(sd_pipe.unet.config) - config["sample_size"] = 96 - sd_pipe.unet = UNet2DConditionModel.from_config(config) - output = sd_pipe(prompt, num_inference_steps=1, output_type="np") - image_shape = output.images[0].shape[:2] - assert image_shape == (192, 192) - - -@slow -@require_paddle_gpu -class StableDiffusionPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, dtype="float32", seed=0): - generator = paddle.Generator().manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = paddle.to_tensor(latents).cast(dtype) - inputs = { - "prompt": "a photograph of an astronaut riding a horse", - "latents": latents, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_1_1_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1") - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.43625, 0.43554, 0.3667, 0.4066, 0.39703, 0.38658, 0.43936, 0.43557, 0.40592]) - assert np.abs(image_slice - expected_slice).max() < 0.0001 - - def test_stable_diffusion_1_4_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.574, 0.47841, 0.31625, 0.63583, 0.58306, 0.55056, 0.50825, 0.56306, 0.55748]) - assert np.abs(image_slice - expected_slice).max() < 0.0001 - - def test_stable_diffusion_ddim(self): - sd_pipe = 
StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.3829, 0.35446, 0.39218, 0.38165, 0.42239]) - assert np.abs(image_slice - expected_slice).max() < 0.0001 - - def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.10542, 0.0962, 0.07332, 0.09015, 0.09382, 0.07597, 0.08496, 0.07806, 0.06455]) - assert np.abs(image_slice - expected_slice).max() < 0.0001 - - def test_stable_diffusion_dpm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.03503, 0.03494, 0.01087, 0.03128, 0.02552, 0.00803, 0.00742, 0.00372, 0.0]) - assert np.abs(image_slice - expected_slice).max() < 0.0001 - - # def test_stable_diffusion_attention_slicing(self): - # pipe = StableDiffusionPipeline.from_pretrained( - # 'CompVis/stable-diffusion-v1-4', paddle_dtype=paddle.float16) - # pipe.set_progress_bar_config(disable=None) - # pipe.enable_attention_slicing() - # inputs = self.get_inputs(dtype='float16') - # image_sliced = pipe(**inputs).images - # mem_bytes = paddle.device.cuda.memory_allocated() - # assert mem_bytes < 3.75 * 10 ** 9 - # pipe.disable_attention_slicing() - # inputs = self.get_inputs(dtype='float16') - # image = pipe(**inputs).images - # mem_bytes = paddle.device.cuda.memory_allocated() - # assert mem_bytes > 3.75 * 10 ** 9 - # assert np.abs(image_sliced - image).max() < 0.001 - - # def test_stable_diffusion_vae_slicing(self): - # pipe = StableDiffusionPipeline.from_pretrained( - # 'CompVis/stable-diffusion-v1-4', paddle_dtype=paddle.float16) - # pipe.set_progress_bar_config(disable=None) - # pipe.enable_attention_slicing() - # pipe.enable_vae_slicing() - # inputs = self.get_inputs(dtype='float16') - # inputs['prompt'] = [inputs['prompt']] * 4 - # inputs['latents'] = paddle.concat(x=[inputs['latents']] * 4) - # image_sliced = pipe(**inputs).images - # mem_bytes = paddle.device.cuda.memory_allocated() - # assert mem_bytes < 4000000000.0 - # pipe.disable_vae_slicing() - # inputs = self.get_inputs(dtype='float16') - # inputs['prompt'] = [inputs['prompt']] * 4 - # inputs['latents'] = paddle.concat(x=[inputs['latents']] * 4) - # image = pipe(**inputs).images - # mem_bytes = paddle.device.cuda.memory_allocated() - # assert mem_bytes > 4000000000.0 - # assert np.abs(image_sliced - image).max() < 0.01 - - def test_stable_diffusion_fp16_vs_autocast(self): - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16) - 
pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs(dtype="float16") - image_fp16 = pipe(**inputs).images - with paddle.amp.auto_cast(True, level="O2"): - inputs = self.get_inputs() - image_autocast = pipe(**inputs).images - diff = np.abs(image_fp16.flatten() - image_autocast.flatten()) - assert diff.mean() < 0.1 - - def test_stable_diffusion_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([-0.5693, -0.3018, -0.9746, 0.0518, -0.877, 0.7559, -1.7402, 0.1022, 1.1582]) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.1958, -0.2993, -1.0166, -0.5005, -0.481, 0.6162, -0.9492, 0.6621, 1.4492] - ) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 - - callback_fn.has_been_called = False - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs(dtype="float16") - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == inputs["num_inference_steps"] - - -@nightly -@require_paddle_gpu -class StableDiffusionPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, dtype="float32", seed=0): - generator = paddle.Generator().manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = paddle.to_tensor(latents).cast(dtype) - inputs = { - "prompt": "a photograph of an astronaut riding a horse", - "latents": latents, - "generator": generator, - "num_inference_steps": 50, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_1_4_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_text2img/stable_diffusion_1_4_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.001 - - def test_stable_diffusion_1_5_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images[0] - expected_image = np.array( - [ - [0.7839468, 0.6564859, 0.48896512], - [0.78088367, 0.6400461, 0.447728], - [0.81458974, 0.67865074, 0.51496047], - ] - ) - max_diff = np.abs(expected_image - image[0][0:3]).max() - assert max_diff < 0.001 - - def test_stable_diffusion_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = 
sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_text2img/stable_diffusion_1_4_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.001 - - def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_text2img/stable_diffusion_1_4_lms.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.001 - - def test_stable_diffusion_euler(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images[0] - expected_image = np.array( - [ - [0.7907467, 0.69895816, 0.5911293], - [0.7878128, 0.6815276, 0.55695873], - [0.79491043, 0.69076216, 0.58900857], - ] - ) - max_diff = np.abs(expected_image - image[0][0:3]).max() - assert max_diff < 0.001 - - def test_stable_diffusion_dpm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - inputs["num_inference_steps"] = 25 - image = sd_pipe(**inputs).images[0] - expected_image = np.array( - [[0.8398815, 0.7510048, 0.6475117], [0.8548264, 0.75703114, 0.63529825], [0.8559129, 0.75676, 0.6597851]] - ) - max_diff = np.abs(expected_image - image[0][0:3]).max() - assert max_diff < 0.001 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py deleted file mode 100644 index 4ba85a62b478..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py +++ /dev/null @@ -1,217 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
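# For reference: the adapter tests below drive StableDiffusionAdapterPipeline with a
# T2IAdapter conditioning model. A minimal sketch of that usage pattern, reusing the
# checkpoints and condition image the slow tests load; the prompt, seed, and step
# count here are illustrative only.
import paddle
from ppdiffusers import StableDiffusionAdapterPipeline, T2IAdapter
from ppdiffusers.utils import load_image

adapter = T2IAdapter.from_pretrained("RzZ/sd-v1-4-adapter-seg")
pipe = StableDiffusionAdapterPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", adapter=adapter, safety_checker=None
)
pipe.enable_attention_slicing()
# Segmentation map used as the adapter condition.
cond = load_image(
    "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/segmentation/sample_input.png"
)
generator = paddle.Generator().manual_seed(0)
image = pipe(
    "A black Honda motorcycle parked in front of a garage",
    image=cond,
    generator=generator,
    num_inference_steps=50,
    guidance_scale=7.5,
).images[0]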
- -import gc -import random -import unittest - -import numpy as np -import paddle - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - PNDMScheduler, - StableDiffusionAdapterPipeline, - T2IAdapter, - UNet2DConditionModel, -) -from ppdiffusers.utils import floats_tensor, load_image, load_numpy, slow -from ppdiffusers.utils.import_utils import is_ppxformers_available - -from ..pipeline_params import ( - TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin - - -class StableDiffusionAdapterPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionAdapterPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - - def get_dummy_components(self): - paddle.seed(seed=0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - paddle.Generator().manual_seed(seed=0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - vae_scale_factor = 2 - paddle.Generator().manual_seed(seed=0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - paddle.Generator().manual_seed(seed=0) - adapter = T2IAdapter( - block_out_channels=[32, 64], - channels_in=3, - num_res_blocks=2, - kernel_size=1, - res_block_skip=True, - use_conv=False, - input_scale_factor=vae_scale_factor, - ) - components = { - "adapter": adapter, - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, seed=0): - image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)) - generator = paddle.Generator().manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_adapter_default_case(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionAdapterPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = sd_pipe(**inputs).images - image_slice = image[(0), -3:, -3:, (-1)] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.9088084, 0.6012194, 0.43046606, 0.7228667, 0.46428588, 0.30164504, 0.508494, 0.6241546, 0.55453974] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005 - - def test_attention_slicing_forward_pass(self): - return self._test_attention_slicing_forward_pass(expected_max_diff=0.002) - - @unittest.skipIf( - not is_ppxformers_available(), - 
reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=0.002) - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=0.002) - - -@slow -class StableDiffusionAdapterPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, revision="segmentation", dtype="float32", seed=0): - generator = paddle.Generator().manual_seed(seed) - image_urls = { - "segmentation": "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/segmentation/sample_input.png", - "keypose": "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/keypose/sample_input.png", - "depth": "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/depth/sample_input.png", - } - prompt_by_rev = { - "segmentation": "A black Honda motorcycle parked in front of a garage", - "keypose": "An astronaut on the moon", - "depth": "An office room with nice view", - } - cond_image = load_image(image_urls[revision]) - inputs = { - "prompt": prompt_by_rev[revision], - "image": cond_image, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_segmentation_adapter(self): - adapter = T2IAdapter.from_pretrained("RzZ/sd-v1-4-adapter-seg") - pipe = StableDiffusionAdapterPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", adapter=adapter, safety_checker=None - ) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs(revision="segmentation") - image = pipe(**inputs).images - assert image.shape == (1, 512, 512, 3) - expected_image = load_numpy( - "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/segmentation/sample_output.npy" - ) - assert np.abs(expected_image - image).max() < 0.005 - - def test_stable_diffusion_keypose_adapter(self): - adapter = T2IAdapter.from_pretrained("RzZ/sd-v1-4-adapter-keypose") - pipe = StableDiffusionAdapterPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", adapter=adapter, safety_checker=None - ) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs(revision="keypose") - image = pipe(**inputs).images - assert image.shape == (1, 512, 512, 3) - expected_image = load_numpy( - "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/keypose/sample_output.npy" - ) - assert np.abs(expected_image - image).max() < 0.005 - - def test_stable_diffusion_depth_adapter(self): - adapter = T2IAdapter.from_pretrained("RzZ/sd-v1-4-adapter-depth") - pipe = StableDiffusionAdapterPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", adapter=adapter, safety_checker=None - ) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs(revision="depth") - image = pipe(**inputs).images - assert image.shape == (1, 512, 512, 3) - expected_image = load_numpy( - "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/depth/sample_output.npy" - ) - assert np.abs(expected_image - image).max() < 0.005 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py deleted file mode 100644 index 1f79050398fb..000000000000 --- 
a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ /dev/null @@ -1,355 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import paddle - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - ControlNetModel, - DDIMScheduler, - StableDiffusionControlNetPipeline, - UNet2DConditionModel, -) -from ppdiffusers.utils import load_image, load_numpy, randn_tensor, slow -from ppdiffusers.utils.import_utils import is_ppxformers_available -from ppdiffusers.utils.testing_utils import require_paddle_gpu - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - - -class StableDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionControlNetPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - paddle.seed(0) - controlnet = ControlNetModel( - block_out_channels=(32, 64), - layers_per_block=2, - in_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - cross_attention_dim=32, - conditioning_embedding_out_channels=(16, 32), - ) - paddle.seed(0) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - paddle.seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - paddle.seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "controlnet": controlnet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed) - - controlnet_embedder_scale_factor = 2 - image = randn_tensor( - (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), - generator=generator, - ) - - inputs = { - "prompt": "A painting 
of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - "image": image, - } - - return inputs - - def test_attention_slicing_forward_pass(self): - return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3) - - @unittest.skipIf( - not is_ppxformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-2) - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=2e-3) - - -@slow -@require_paddle_gpu -class StableDiffusionControlNetPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_canny(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.set_progress_bar_config(disable=None) - - generator = paddle.Generator().manual_seed(0) - prompt = "bird" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np") - - image = output.images[0] - - assert image.shape == (768, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_depth(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-depth") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.set_progress_bar_config(disable=None) - - generator = paddle.Generator().manual_seed(0) - prompt = "Stormtrooper's lecture" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np") - - image = output.images[0] - - assert image.shape == (512, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_hed(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-hed") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.set_progress_bar_config(disable=None) - - generator = paddle.Generator().manual_seed(0) - prompt = "oil painting of handsome old man, masterpiece" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/man_hed.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np") - - image = output.images[0] - - assert image.shape == (704, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/man_hed_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def 
test_mlsd(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-mlsd") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.set_progress_bar_config(disable=None) - - generator = paddle.Generator().manual_seed(0) - prompt = "room" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/room_mlsd.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np") - - image = output.images[0] - - assert image.shape == (704, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/room_mlsd_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_normal(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-normal") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.set_progress_bar_config(disable=None) - - generator = paddle.Generator().manual_seed(0) - prompt = "cute toy" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/cute_toy_normal.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np") - - image = output.images[0] - - assert image.shape == (512, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/cute_toy_normal_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_openpose(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.set_progress_bar_config(disable=None) - - generator = paddle.Generator().manual_seed(0) - prompt = "Chef in the kitchen" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np") - - image = output.images[0] - - assert image.shape == (768, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/chef_pose_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_scribble(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-scribble") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.set_progress_bar_config(disable=None) - - generator = paddle.Generator().manual_seed(5) - prompt = "bag" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bag_scribble.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np") - - image = output.images[0] - - assert image.shape == (640, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bag_scribble_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_seg(self): - controlnet = 
ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.set_progress_bar_config(disable=None) - - generator = paddle.Generator().manual_seed(5) - prompt = "house" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np") - - image = output.images[0] - - assert image.shape == (512, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py deleted file mode 100644 index 4e8d7ed819cf..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py +++ /dev/null @@ -1,270 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
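# For reference: the tests below exercise StableDiffusionImageVariationPipeline, which
# conditions on a CLIP image embedding instead of a text prompt. A minimal sketch of
# that usage, reusing the checkpoint and input image from the slow tests; the seed and
# step count are illustrative only.
import paddle
from ppdiffusers import StableDiffusionImageVariationPipeline
from ppdiffusers.utils import load_image

pipe = StableDiffusionImageVariationPipeline.from_pretrained(
    "fusing/sd-image-variations-diffusers", safety_checker=None
)
init_image = load_image(
    "https://paddlenlp.bj.bcebos.com/data/images/input_image_vermeer.png"
)
generator = paddle.Generator().manual_seed(0)
# No prompt is passed: the input image itself drives the generation.
variation = pipe(
    image=init_image,
    generator=generator,
    num_inference_steps=25,
    guidance_scale=7.5,
).images[0]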
- -import gc -import random -import unittest - -import numpy as np -import paddle -from PIL import Image -from ..pipeline_params import ( - IMAGE_VARIATION_BATCH_PARAMS, - IMAGE_VARIATION_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin - -from paddlenlp.transformers import ( - CLIPImageProcessor, - CLIPVisionConfig, - CLIPVisionModelWithProjection, -) -from ppdiffusers import ( - AutoencoderKL, - DPMSolverMultistepScheduler, - PNDMScheduler, - StableDiffusionImageVariationPipeline, - UNet2DConditionModel, -) -from ppdiffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - - -class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionImageVariationPipeline - params = IMAGE_VARIATION_PARAMS - batch_params = IMAGE_VARIATION_BATCH_PARAMS - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - paddle.seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - paddle.seed(0) - image_encoder_config = CLIPVisionConfig( - hidden_size=32, - projection_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - image_size=32, - patch_size=4, - ) - image_encoder = CLIPVisionModelWithProjection(image_encoder_config) - feature_extractor = CLIPImageProcessor(crop_size=32, size=32) - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "image_encoder": image_encoder, - "feature_extractor": feature_extractor, - "safety_checker": None, - } - return components - - def get_dummy_inputs(self, seed=0): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - image = image.cpu().transpose(perm=[0, 2, 3, 1])[0] - image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32)) - generator = paddle.Generator().manual_seed(seed) - - inputs = { - "image": image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_img_variation_default_case(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionImageVariationPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.22073305, 0.22751817, 0.32176197, 0.26315716, 0.25681925, 0.41432184, 0.2454437 , 0.10104704, 0.32165903] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - - def test_stable_diffusion_img_variation_multiple_images(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionImageVariationPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - inputs["image"] = 2 * [inputs["image"]] - output = sd_pipe(**inputs) - image = output.images - image_slice = image[-1, -3:, -3:, -1] - assert 
image.shape == (2, 64, 64, 3) - expected_slice = np.array( - [0.61040395, 0.7414253 , 0.5950623 , 0.5843509 , 0.25609648, 0.28481025, 0.61782926, 0.3014974 , 0.35131538] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - - -@slow -@require_paddle_gpu -class StableDiffusionImageVariationPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, dtype="float32", seed=0): - generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_image_vermeer.png" - ) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = paddle.to_tensor(latents).cast(dtype) - inputs = { - "image": init_image, - "latents": latents, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_img_variation_pipeline_default(self): - sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained( - "fusing/sd-image-variations-diffusers", safety_checker=None - ) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [ - 0.5717014670372009, - 0.47024625539779663, - 0.47462183237075806, - 0.6388776898384094, - 0.5250844359397888, - 0.500831663608551, - 0.638043999671936, - 0.5769134163856506, - 0.5223015546798706, - ] - ) - assert np.abs(image_slice - expected_slice).max() < 0.0001 - - def test_stable_diffusion_img_variation_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.1621, 0.2837, -0.7979, -0.1221, -1.3057, 0.7681, -2.1191, 0.0464, 1.6309] - ) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.6299, 1.75, 1.1992, -2.1582, -1.8994, 0.7334, -0.709, 1.0137, 1.5273]) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 - - callback_fn.has_been_called = False - pipe = StableDiffusionImageVariationPipeline.from_pretrained( - "fusing/sd-image-variations-diffusers", safety_checker=None, paddle_dtype=paddle.float16 - ) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs(dtype="float16") - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == inputs["num_inference_steps"] - - -@nightly -@require_paddle_gpu -class StableDiffusionImageVariationPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, dtype="float32", seed=0): - generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_image_vermeer.png" - ) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = 
paddle.to_tensor(latents).cast(dtype) - inputs = { - "image": init_image, - "latents": latents, - "generator": generator, - "num_inference_steps": 50, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_img_variation_pndm(self): - sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers") - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/lambdalabs_variations_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.001 - - def test_img_variation_dpm(self): - sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - inputs["num_inference_steps"] = 25 - image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/lambdalabs_variations_dpm_multi.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.001 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py deleted file mode 100644 index 0f40d86931e8..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ /dev/null @@ -1,446 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
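# For reference: the tests below exercise StableDiffusionImg2ImgPipeline, where an init
# image plus a strength value controls how far the sampler is allowed to move away from
# it. A minimal sketch of that usage with the same checkpoint and sketch image the slow
# tests load; strength, seed, and step count are illustrative only.
import paddle
from ppdiffusers import StableDiffusionImg2ImgPipeline
from ppdiffusers.utils import load_image

pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", safety_checker=None
)
pipe.enable_attention_slicing()
init_image = load_image(
    "https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.png"
)
generator = paddle.Generator().manual_seed(0)
image = pipe(
    prompt="a fantasy landscape, concept art, high resolution",
    image=init_image,
    strength=0.75,
    guidance_scale=7.5,
    generator=generator,
    num_inference_steps=50,
).images[0]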
- -import gc -import random -import unittest - -import numpy as np -import paddle - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionImg2ImgPipeline, - UNet2DConditionModel, -) -from ppdiffusers.image_processor import VaeImageProcessor -from ppdiffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - -from ..pipeline_params import ( - TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin - - -class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionImg2ImgPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} - required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - paddle.seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - paddle.seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, seed=0, input_image_type="pd", output_type="np"): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - generator = paddle.Generator().manual_seed(seed) - - if input_image_type == "pd": - input_image = image - elif input_image_type == "np": - input_image = image.numpy().transpose(0, 2, 3, 1) - elif input_image_type == "pil": - input_image = image.numpy().transpose(0, 2, 3, 1) - input_image = VaeImageProcessor.numpy_to_pil(input_image) - else: - raise ValueError(f"unsupported input_image_type {input_image_type}.") - - if output_type not in ["pd", "np", "pil"]: - raise ValueError(f"unsupported output_type {output_type}") - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": input_image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": output_type, - } - return inputs - - def test_stable_diffusion_img2img_default_case(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - 
assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [0.50082374, 0.49329656, 0.4963757, 0.46307105, 0.44599247, 0.4877512, 0.560709, 0.56884044, 0.5738671] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - - def test_stable_diffusion_img2img_negative_prompt(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionImg2ImgPipeline(**components) - # sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=True) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [0.48659712, 0.4004616, 0.4762491, 0.49117112, 0.5414775, 0.58218545, 0.5550886, 0.52305603, 0.61624044] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - - def test_stable_diffusion_img2img_multiple_init_images(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionImg2ImgPipeline(**components) - # sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=True) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - inputs["prompt"] = [inputs["prompt"]] * 2 - inputs["image"] = inputs["image"].tile(repeat_times=[2, 1, 1, 1]) - image = sd_pipe(**inputs).images - image_slice = image[-1, -3:, -3:, -1] - assert image.shape == (2, 32, 32, 3) - expected_slice = np.array( - [0.49016288, 0.23989454, 0.4229045, 0.56873804, 0.467226, 0.5793949, 0.6967555, 0.7027658, 0.5809763] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - - def test_stable_diffusion_img2img_k_lms(self): - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" - ) - sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [0.29999942, 0.5206376, 0.37915814, 0.4033721, 0.7630579, 0.4642547, 0.5823178, 0.6936951, 0.48969278] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - - def test_pt_np_pil_outputs_equivalent(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - - output_pt = sd_pipe(**self.get_dummy_inputs(output_type="pd"))[0] - output_np = sd_pipe(**self.get_dummy_inputs(output_type="np"))[0] - output_pil = sd_pipe(**self.get_dummy_inputs(output_type="pil"))[0] - - assert np.abs(output_pt.numpy().transpose(0, 2, 3, 1) - output_np).max() <= 1e-4 - assert np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max() <= 1e-4 - - def test_image_types_consistent(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - output_pt = sd_pipe(**self.get_dummy_inputs(input_image_type="pd"))[0] - output_np = sd_pipe(**self.get_dummy_inputs(input_image_type="np"))[0] - output_pil = sd_pipe(**self.get_dummy_inputs(input_image_type="pil"))[0] - - assert np.abs(output_pt - output_np).max() <= 1e-4 - assert np.abs(output_pil - 
output_np).max() <= 2e-2 - - -@slow -@require_paddle_gpu -class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, dtype="float32", seed=0): - generator = paddle.Generator().manual_seed(seed) - init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.png") - inputs = { - "prompt": "a fantasy landscape, concept art, high resolution", - "image": init_image, - "generator": generator, - "num_inference_steps": 3, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "np", - } - return inputs - - # def test_img2img_2nd_order(self): - # sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - # sd_pipe.scheduler = HeunDiscreteScheduler.from_config(sd_pipe.scheduler.config) - # sd_pipe.set_progress_bar_config(disable=None) - - # inputs = self.get_inputs() - # inputs["num_inference_steps"] = 10 - # inputs["strength"] = 0.75 - # image = sd_pipe(**inputs).images[0] - - # expected_image = load_numpy( - # "https://paddlenlp.bj.bcebos.com/data/images/img2img_heun.npy" - # ) - # max_diff = np.abs(expected_image - image).max() - # assert max_diff < 5e-2 - - # inputs = self.get_inputs() - # inputs["num_inference_steps"] = 11 - # inputs["strength"] = 0.75 - # image_other = sd_pipe(**inputs).images[0] - - # mean_diff = np.abs(image - image_other).mean() - - # # images should be very similar - # assert mean_diff < 5e-2 - - def test_stable_diffusion_img2img_default(self): - pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 768, 3) - expected_slice = np.array([0.27150, 0.14849, 0.15605, 0.26740, 0.16954, 0.18204, 0.31470, 0.26311, 0.24525]) - assert np.abs(expected_slice - image_slice).max() < 0.001 - - # def test_img2img_safety_checker_works(self): - # sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - # sd_pipe.set_progress_bar_config(disable=None) - - # inputs = self.get_inputs() - # inputs["num_inference_steps"] = 20 - # # make sure the safety checker is activated - # inputs["prompt"] = "naked, sex, porn" - # out = sd_pipe(**inputs) - # breakpoint() - - # assert out.nsfw_content_detected[0], f"Safety checker should work for prompt: {inputs['prompt']}" - # assert np.abs(out.images[0]).sum() < 1e-5 # should be all zeros - - def test_stable_diffusion_img2img_k_lms(self): - pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 768, 3) - expected_slice = np.array([0.04890, 0.04862, 0.06422, 0.04655, 0.05108, 0.05307, 0.05926, 0.08759, 0.06852]) - assert np.abs(expected_slice - image_slice).max() < 0.001 - - def test_stable_diffusion_img2img_ddim(self): - pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - 
pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 768, 3) - expected_slice = np.array([0.06069, 0.05703, 0.08054, 0.05797, 0.06286, 0.06234, 0.08438, 0.11151, 0.08068]) - assert np.abs(expected_slice - image_slice).max() < 0.001 - - def test_stable_diffusion_img2img_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 96) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [ - 0.7650054097175598, - 0.10256098955869675, - 0.4976114332675934, - 3.388350009918213, - 3.7242040634155273, - 4.272988796234131, - 2.4656283855438232, - 3.483647108078003, - 1.765011191368103, - ] - ) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 96) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [ - 0.7580092549324036, - 0.10288780182600021, - 0.4941849708557129, - 3.3663346767425537, - 3.7071609497070312, - 4.25173807144165, - 2.4461638927459717, - 3.451681137084961, - 1.761878490447998, - ] - ) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 - - callback_fn.has_been_called = False - pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, paddle_dtype=paddle.float16 - ) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs(dtype="float16") - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == 2 - - def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): - init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.jpg") - init_image = init_image.resize((760, 504)) - model_id = "CompVis/stable-diffusion-v1-4" - pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id, safety_checker=None) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - prompt = "A fantasy landscape, trending on artstation" - generator = paddle.Generator().manual_seed(0) - output = pipe( - prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5, generator=generator, output_type="np" - ) - image = output.images[0] - image_slice = image[255:258, 383:386, -1] - assert image.shape == (504, 760, 3) - expected_slice = np.array( - [0.71240354, 0.71053374, 0.69922864, 0.7139934, 0.7106118, 0.69451976, 0.71982634, 0.71717453, 0.70306426] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005 - - -@nightly -@require_paddle_gpu -class StableDiffusionImg2ImgPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, dtype="float32", seed=0): - generator = paddle.Generator().manual_seed(seed) - init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.png") - inputs = { - "prompt": "a fantasy landscape, concept art, high resolution", - "image": init_image, - "generator": generator, - "num_inference_steps": 50, - "strength": 0.75, - 
"guidance_scale": 7.5, - "output_type": "np", - } - return inputs - - def test_img2img_pndm(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images[0] - expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_pndm.npy") - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.001 - - def test_img2img_ddim(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images[0] - expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_ddim.npy") - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.001 - - def test_img2img_lms(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images[0] - expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_lms.npy") - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.001 - - def test_img2img_dpm(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - inputs["num_inference_steps"] = 30 - image = sd_pipe(**inputs).images[0] - expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_dpm.npy") - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.001 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py deleted file mode 100644 index e0dcfa714125..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ /dev/null @@ -1,455 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import random -import unittest - -import numpy as np -import paddle -from PIL import Image - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - DPMSolverMultistepScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionInpaintPipeline, - UNet2DConditionModel, -) -from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import ( - prepare_mask_and_masked_image, -) -from ppdiffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - -from ..pipeline_params import ( - TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin - - -class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionInpaintPipeline - params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=9, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - paddle.seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - paddle.seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, seed=0): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - image = image.cpu().transpose(perm=[0, 2, 3, 1])[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64)) - generator = paddle.Generator().manual_seed(seed) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_inpaint(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionInpaintPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.55786943, 0.628228, 0.49147403, 0.3191774, 0.39249492, 0.46521175, 0.29909956, 0.21160087, 0.42932406] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_inpaint_image_tensor(self): - components = 
self.get_dummy_components() - sd_pipe = StableDiffusionInpaintPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - output = sd_pipe(**inputs) - out_pil = output.images - inputs = self.get_dummy_inputs() - inputs["image"] = ( - paddle.to_tensor(np.array(inputs["image"]) / 127.5 - 1).transpose(perm=[2, 0, 1]).unsqueeze(axis=0) - ) - inputs["mask_image"] = ( - paddle.to_tensor(np.array(inputs["mask_image"]) / 255).transpose(perm=[2, 0, 1])[:1].unsqueeze(axis=0) - ) - output = sd_pipe(**inputs) - out_tensor = output.images - assert out_pil.shape == (1, 64, 64, 3) - assert np.abs(out_pil.flatten() - out_tensor.flatten()).max() < 0.05 - - # TODO, fix this nan. - def test_float16_inference(self, expected_max_diff=1e-2): - pass - - -@slow -@require_paddle_gpu -class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase): - def setUp(self): - super().setUp() - - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, dtype="float32", seed=0): - generator = paddle.Generator().manual_seed(seed) - init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png") - mask_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png") - inputs = { - "prompt": "Face of a yellow cat, high resolution, sitting on a park bench", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_inpaint_ddim(self): - pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", safety_checker=None - ) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.05978, 0.10983, 0.10514, 0.07922, 0.08483, 0.08587, 0.05302, 0.03218, 0.01636]) - assert np.abs(expected_slice - image_slice).max() < 0.0001 - - def test_stable_diffusion_inpaint_fp16(self): - pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", paddle_dtype=paddle.float16, safety_checker=None - ) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs(dtype="float16") - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.9921875, 0.9477539, 0.90234375, 0.96484375, 0.9189453, 0.875, 0.9316406, 0.9013672, 0.875] - ) - assert np.abs(expected_slice - image_slice).max() < 0.05 - - def test_stable_diffusion_inpaint_pndm(self): - pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", safety_checker=None - ) - pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.06892, 0.06994, 0.07905, 0.05366, 0.04709, 0.04890, 0.04107, 0.05083, 0.04180]) - assert np.abs(expected_slice - image_slice).max() < 0.0001 - - def test_stable_diffusion_inpaint_k_lms(self): - pipe = 
StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", safety_checker=None - ) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.23513, 0.22413, 0.29442, 0.24243, 0.26214, 0.30329, 0.26431, 0.25025, 0.25197]) - assert np.abs(expected_slice - image_slice).max() < 0.0001 - - -@nightly -@require_paddle_gpu -class StableDiffusionInpaintPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, dtype="float32", seed=0): - generator = paddle.Generator().manual_seed(seed) - init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png") - mask_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png") - inputs = { - "prompt": "Face of a yellow cat, high resolution, sitting on a park bench", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 50, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_inpaint_ddim(self): - sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting") - sd_pipe - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images[0] - expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_ddim.npy") - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.001 - - def test_inpaint_pndm(self): - sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting") - sd_pipe.scheduler = PNDMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images[0] - expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_pndm.npy") - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.001 - - def test_inpaint_lms(self): - sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images[0] - expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_lms.npy") - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.001 - - def test_inpaint_dpm(self): - sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - inputs["num_inference_steps"] = 30 - image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_dpm_multi.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.001 - - -class StableDiffusionInpaintingPrepareMaskAndMaskedImageTests(unittest.TestCase): - def test_pil_inputs(self): - im = 
np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8) - im = Image.fromarray(im) - mask = np.random.randint(0, 255, (32, 32), dtype=np.uint8) > 127.5 - mask = Image.fromarray((mask * 255).astype(np.uint8)) - t_mask, t_masked = prepare_mask_and_masked_image(im, mask) - self.assertTrue(isinstance(t_mask, paddle.Tensor)) - self.assertTrue(isinstance(t_masked, paddle.Tensor)) - self.assertEqual(t_mask.ndim, 4) - self.assertEqual(t_masked.ndim, 4) - self.assertEqual(t_mask.shape, [1, 1, 32, 32]) - self.assertEqual(t_masked.shape, [1, 3, 32, 32]) - self.assertTrue(t_mask.dtype == paddle.float32) - self.assertTrue(t_masked.dtype == paddle.float32) - self.assertTrue(t_mask.logsumexp() >= 0.0) - self.assertTrue(t_mask.max() <= 1.0) - self.assertTrue(t_masked.min() >= -1.0) - self.assertTrue(t_masked.min() <= 1.0) - - self.assertTrue(t_mask.sum() > 0.0) - - def test_np_inputs(self): - im_np = np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8) - im_pil = Image.fromarray(im_np) - mask_np = np.random.randint(0, 255, (32, 32), dtype=np.uint8) > 127.5 - mask_pil = Image.fromarray((mask_np * 255).astype(np.uint8)) - t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) - t_mask_pil, t_masked_pil = prepare_mask_and_masked_image(im_pil, mask_pil) - self.assertTrue((t_mask_np == t_mask_pil).all()) - self.assertTrue((t_masked_np == t_masked_pil).all()) - - def test_paddle_3D_2D_inputs(self): - im_tensor = paddle.randint(0, 255, (3, 32, 32)).cast("uint8") - mask_tensor = paddle.randint(0, 255, (32, 32)).cast("uint8") > 127.5 - im_np = im_tensor.numpy().transpose(1, 2, 0) - mask_np = mask_tensor.numpy() - - t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64") - ) - t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) - - self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_paddle_3D_3D_inputs(self): - im_tensor = paddle.randint(0, 255, (3, 32, 32)).cast("uint8") - mask_tensor = paddle.randint(0, 255, (1, 32, 32)).cast("uint8") > 127.5 - im_np = im_tensor.numpy().transpose(1, 2, 0) - mask_np = mask_tensor.numpy()[0] - t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64") - ) - t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) - self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_paddle_4D_2D_inputs(self): - im_tensor = paddle.randint(0, 255, (1, 3, 32, 32)).cast("uint8") - mask_tensor = paddle.randint(0, 255, (32, 32)).cast("uint8") > 127.5 - im_np = im_tensor.numpy()[0].transpose(1, 2, 0) - mask_np = mask_tensor.numpy() - t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64") - ) - t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) - self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_paddle_4D_3D_inputs(self): - im_tensor = paddle.randint(0, 255, (1, 3, 32, 32)).cast("uint8") - mask_tensor = paddle.randint(0, 255, (1, 32, 32)).cast("uint8") > 127.5 - im_np = im_tensor.numpy()[0].transpose(1, 2, 0) - mask_np = mask_tensor.numpy()[0] - t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64") - ) - t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) - 
self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_paddle_4D_4D_inputs(self): - im_tensor = paddle.randint(0, 255, (1, 3, 32, 32)).cast("uint8") - mask_tensor = paddle.randint(0, 255, (1, 1, 32, 32)).cast("uint8") > 127.5 - im_np = im_tensor.numpy()[0].transpose(1, 2, 0) - mask_np = mask_tensor.numpy()[0][0] - t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64") - ) - t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) - self.assertTrue((t_mask_tensor == t_mask_np.cast("float64")).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_paddle_batch_4D_3D(self): - im_tensor = paddle.randint(0, 255, (2, 3, 32, 32)).cast("uint8") - mask_tensor = paddle.randint(0, 255, (2, 32, 32)).cast("uint8") > 127.5 - im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor] - mask_nps = [mask.numpy() for mask in mask_tensor] - t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64") - ) - nps = [prepare_mask_and_masked_image(i, m) for i, m in zip(im_nps, mask_nps)] - t_mask_np = paddle.concat(x=[n[0] for n in nps]) - t_masked_np = paddle.concat(x=[n[1] for n in nps]) - self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_paddle_batch_4D_4D(self): - im_tensor = paddle.randint(0, 255, (2, 3, 32, 32)).cast("uint8") - mask_tensor = paddle.randint(0, 255, (2, 32, 32)).cast("uint8") > 127.5 - - im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor] - mask_nps = [mask.numpy() for mask in mask_tensor] - t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64") - ) - nps = [prepare_mask_and_masked_image(i, m) for i, m in zip(im_nps, mask_nps)] - t_mask_np = paddle.concat(x=[n[0] for n in nps]) - t_masked_np = paddle.concat(x=[n[1] for n in nps]) - self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_shape_mismatch(self): - with self.assertRaises(AssertionError): - prepare_mask_and_masked_image(paddle.randn(shape=[3, 32, 32]), paddle.randn(shape=[64, 64])) - with self.assertRaises(AssertionError): - prepare_mask_and_masked_image(paddle.randn(shape=[2, 3, 32, 32]), paddle.randn(shape=[4, 64, 64])) - with self.assertRaises(AssertionError): - prepare_mask_and_masked_image(paddle.randn(shape=[2, 3, 32, 32]), paddle.randn(shape=[4, 1, 64, 64])) - - def test_type_mismatch(self): - with self.assertRaises(TypeError): - prepare_mask_and_masked_image(paddle.rand(shape=[3, 32, 32]), paddle.rand(shape=[3, 32, 32]).numpy()) - with self.assertRaises(TypeError): - prepare_mask_and_masked_image(paddle.rand(shape=[3, 32, 32]).numpy(), paddle.rand(shape=[3, 32, 32])) - - def test_channels_first(self): - with self.assertRaises(AssertionError): - prepare_mask_and_masked_image(paddle.rand(shape=[32, 32, 3]), paddle.rand(shape=[3, 32, 32])) - - def test_tensor_range(self): - with self.assertRaises(ValueError): - prepare_mask_and_masked_image(paddle.ones(shape=[3, 32, 32]) * 2, paddle.rand(shape=[32, 32])) - with self.assertRaises(ValueError): - prepare_mask_and_masked_image(paddle.ones(shape=[3, 32, 32]) * -2, paddle.rand(shape=[32, 32])) - with self.assertRaises(ValueError): - prepare_mask_and_masked_image(paddle.rand(shape=[3, 32, 32]), paddle.ones(shape=[32, 32]) * 2) - with 
self.assertRaises(ValueError): - prepare_mask_and_masked_image(paddle.rand(shape=[3, 32, 32]), paddle.ones(shape=[32, 32]) * -1) diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py deleted file mode 100644 index 95b58f1640d2..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py +++ /dev/null @@ -1,541 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import paddle -from PIL import Image - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionInpaintPipelineLegacy, - UNet2DConditionModel, - UNet2DModel, - VQModel, -) -from ppdiffusers.utils import floats_tensor, load_image, nightly, slow -from ppdiffusers.utils.testing_utils import load_numpy, preprocess_image, require_paddle_gpu - - -class StableDiffusionInpaintLegacyPipelineFastTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = 32, 32 - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) - return image - - @property - def dummy_uncond_unet(self): - paddle.seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - @property - def dummy_cond_unet(self): - paddle.seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_cond_unet_inpaint(self): - paddle.seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=9, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_vq_model(self): - paddle.seed(0) - model = VQModel( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, - ) - return model - - @property - def dummy_vae(self): - paddle.seed(0) - model = AutoencoderKL( - 
block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return model - - @property - def dummy_text_encoder(self): - paddle.seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config).eval() - - @property - def dummy_extractor(self): - def extract(*args, **kwargs): - class Out: - def __init__(self): - self.pixel_values = paddle.ones(shape=[0]) - - def to(self, device): - self.pixel_values - return self - - return Out() - - return extract - - def test_stable_diffusion_inpaint_legacy(self): - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) - sd_pipe = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - generator = paddle.Generator().manual_seed(0) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ) - image = output.images - generator = paddle.Generator().manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - return_dict=False, - )[0] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [0.0, 0.42294562, 0.31831095, 0.11458772, 0.57409716, 0.6021224, 0.3139254, 0.6612497, 0.51271075] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_inpaint_legacy_batched(self): - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB") - init_images_tens = preprocess_image(init_image, batch_size=2) - init_masks_tens = init_images_tens + 4 - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = paddle.Generator().manual_seed(0) - images = sd_pipe( - [prompt] * 2, - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - 
image=init_images_tens, - mask_image=init_masks_tens, - ).images - - assert images.shape == (2, 32, 32, 3) - - image_slice_0 = images[0, -3:, -3:, -1].flatten() - image_slice_1 = images[1, -3:, -3:, -1].flatten() - - expected_slice_0 = np.array([0.36070424, 0.6893935 , 0.5395819 , 0.43957978, 0.52270013, 0.41502672, 0.5263115 , 0.57829964, 0.61257005]) - expected_slice_1 = np.array([0.5639288 , 0.6165064 , 0.6162699 , 0.6441643 , 0.56632555, 0.39611217, 0.62690437, 0.5277921 , 0.5135379]) - - assert np.abs(expected_slice_0 - image_slice_0).max() < 1e-2 - assert np.abs(expected_slice_1 - image_slice_1).max() < 1e-2 - - def test_stable_diffusion_inpaint_legacy_negative_prompt(self): - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) - sd_pipe = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - negative_prompt = "french fries" - generator = paddle.Generator().manual_seed(0) - output = sd_pipe( - prompt, - negative_prompt=negative_prompt, - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [0.0, 0.42320636, 0.3191024, 0.11486277, 0.5742749, 0.6025071, 0.31415784, 0.66176593, 0.5128486] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) - sd_pipe = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - images = sd_pipe( - prompt, num_inference_steps=2, output_type="np", image=init_image, mask_image=mask_image - ).images - assert images.shape == (1, 32, 32, 3) - batch_size = 2 - images = sd_pipe( - [prompt] * batch_size, num_inference_steps=2, output_type="np", image=init_image, mask_image=mask_image - ).images - assert images.shape == (batch_size, 32, 32, 3) - num_images_per_prompt = 2 - images = sd_pipe( - prompt, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - num_images_per_prompt=num_images_per_prompt, - ).images - assert images.shape == (num_images_per_prompt, 32, 32, 3) - batch_size = 2 - images = sd_pipe( - [prompt] * batch_size, - 
num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - num_images_per_prompt=num_images_per_prompt, - ).images - assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3) - - -@slow -@require_paddle_gpu -class StableDiffusionInpaintLegacyPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png" - ) - mask_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png" - ) - inputs = { - "prompt": "A red cat sitting on a park bench", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 3, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_inpaint_legacy_pndm(self): - pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None - ) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.27226633, 0.29068208, 0.3450312, 0.21444553, 0.26328486, 0.34392387, 0.18026042, 0.24961185, 0.3214044] - ) - assert np.abs(expected_slice - image_slice).max() < 0.0001 - - def test_stable_diffusion_inpaint_legacy_batched(self): - pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None - ) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - inputs["prompt"] = [inputs["prompt"]] * 2 - inputs["image"] = preprocess_image(inputs["image"], batch_size=2) - - mask = inputs["mask_image"].convert("L") - mask = np.array(mask).astype(np.float32) / 255.0 - mask = paddle.to_tensor(1 - mask) - masks = paddle.stack([mask[None]] * 2, axis=0) - inputs["mask_image"] = masks - - image = pipe(**inputs).images - assert image.shape == (2, 512, 512, 3) - - image_slice_0 = image[0, 253:256, 253:256, -1].flatten() - image_slice_1 = image[1, 253:256, 253:256, -1].flatten() - - expected_slice_0 = np.array( - [0.27526367, 0.29158682, 0.35184938, 0.21504477, 0.26708275, 0.35169, 0.18185198, 0.2572803 , 0.32425082] - ) - expected_slice_1 = np.array( - [0. 
, 0.18929192, 0.7068148 , 0.07977328, 0.13444492, 0.5016247 , 0.49761847, 0.2830933 , 0.36412603] - ) - - assert np.abs(expected_slice_0 - image_slice_0).max() < 1e-4 - assert np.abs(expected_slice_1 - image_slice_1).max() < 1e-4 - - def test_stable_diffusion_inpaint_legacy_k_lms(self): - pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None - ) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.29036117, 0.28907132, 0.32839334, 0.26510137, 0.2820784, 0.31148806, 0.29358387, 0.29515788, 0.28257304] - ) - assert np.abs(expected_slice - image_slice).max() < 0.0001 - - def test_stable_diffusion_inpaint_legacy_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([-0.103, 1.415, -0.02197, -0.5103, -0.5903, 0.1953, 0.75, 0.3477, -1.356]) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.001 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.4802, 1.154, 0.628, 0.2322, 0.2593, -0.1455, 0.7075, -0.1617, -0.5615]) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.001 - - callback_fn.has_been_called = False - pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, paddle_dtype=paddle.float16 - ) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs() - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == 2 - - -@nightly -@require_paddle_gpu -class StableDiffusionInpaintLegacyPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, dtype="float32", seed=0): - generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/input_bench_image.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/input_bench_mask.png" - ) - inputs = { - "prompt": "A red cat sitting on a park bench", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 50, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_inpaint_pndm(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images[0] - expected_image = np.array([[0.7330009, 0.80003107, 0.8268216], [0.73606366, 0.801595, 0.8470554]]) - max_diff = np.abs(expected_image - image[0][0:2]).max() - assert max_diff < 0.001 - - 
def test_inpaint_ddim(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint_legacy/stable_diffusion_1_5_ddim.npy" - ) - expected_image = np.array([[0.7290994, 0.794852, 0.82096446], [0.7330909, 0.79727536, 0.8420528]]) - max_diff = np.abs(expected_image - image[0][0:2]).max() - assert max_diff < 0.001 - - def test_inpaint_lms(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images[0] - expected_image = np.array([[0.74595624, 0.81757987, 0.84589916], [0.74728143, 0.81736475, 0.86543]]) - max_diff = np.abs(expected_image - image[0][0:2]).max() - assert max_diff < 0.001 - - def test_inpaint_dpm(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - inputs["num_inference_steps"] = 30 - image = sd_pipe(**inputs).images[0] - expected_image = np.array([[0.7310472, 0.7970823, 0.8231524], [0.7348697, 0.799358, 0.8439586]]) - max_diff = np.abs(expected_image - image[0][0:2]).max() - assert max_diff < 0.001 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py deleted file mode 100644 index d8b6992f4117..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py +++ /dev/null @@ -1,293 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import random -import unittest - -import numpy as np -import paddle -from PIL import Image -from ..pipeline_params import ( - TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionInstructPix2PixPipeline, - UNet2DConditionModel, -) -from ppdiffusers.utils import floats_tensor, load_image, slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - - -class StableDiffusionInstructPix2PixPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionInstructPix2PixPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "cross_attention_kwargs"} - batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=8, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - paddle.seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - paddle.seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, seed=0): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - image = image.cpu().transpose(perm=[0, 2, 3, 1])[0] - image = Image.fromarray(np.uint8(image)).convert("RGB") - generator = paddle.Generator().manual_seed(seed) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "image_guidance_scale": 1, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_pix2pix_default_case(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionInstructPix2PixPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [0.24897021, 0.3813318 , 0.15630311, 0.69198483, 0.7409521 , 0.55128014, 0.5978868 , 0.60921687, 0.47007012] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - - def test_stable_diffusion_pix2pix_negative_prompt(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionInstructPix2PixPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - 
inputs = self.get_dummy_inputs() - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [0.27121854, 0.34936333, 0.12865198, 0.77894104, 0.81688535, 0.6136005, 0.62261313, 0.6386795 , 0.5096967] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - - def test_stable_diffusion_pix2pix_multiple_init_images(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionInstructPix2PixPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - inputs["prompt"] = [inputs["prompt"]] * 2 - image = np.array(inputs["image"]).astype(np.float32) / 255.0 - image = paddle.to_tensor(data=image).unsqueeze(axis=0) - image = image.transpose(perm=[0, 3, 1, 2]) - inputs["image"] = image.tile(repeat_times=[2, 1, 1, 1]) - image = sd_pipe(**inputs).images - image_slice = image[-1, -3:, -3:, -1] - assert image.shape == (2, 32, 32, 3) - - expected_slice = np.array( - [0.41508308, 0.41580454, 0.5588631, 0.32340443, 0.20930073, 0.35993075, 0.28470254, 0.38203996, 0.51769114] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - - def test_stable_diffusion_pix2pix_euler(self): - components = self.get_dummy_components() - components["scheduler"] = EulerAncestralDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" - ) - sd_pipe = StableDiffusionInstructPix2PixPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - slice = [round(x, 4) for x in image_slice.flatten().tolist()] - print(",".join([str(x) for x in slice])) - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [0.26694882, 0.4288544 , 0.21950376, 0.74369204, 0.6756442 , 0.54577595, 0.5941435 , 0.5603916 , 0.51743454] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - -@slow -@require_paddle_gpu -class StableDiffusionInstructPix2PixPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed=seed) - image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/example.jpg" - ) - inputs = { - "prompt": "turn him into a cyborg", - "image": image, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "image_guidance_scale": 1.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_pix2pix_default(self): - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - "timbrooks/instruct-pix2pix", safety_checker=None - ) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.32138163, 0.32519442, 0.33127248, 0.32613453, 0.33317798, 0.33505, 0.32397628, 0.32964426, 0.32055843] - ) - assert np.abs(expected_slice - image_slice).max() < 0.001 - - def test_stable_diffusion_pix2pix_k_lms(self): - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - "timbrooks/instruct-pix2pix", safety_checker=None - ) - pipe.scheduler = 
LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.38934484, 0.3929934, 0.39973113, 0.4196028, 0.42386433, 0.43073824, 0.4267708, 0.43173674, 0.41896266] - ) - assert np.abs(expected_slice - image_slice).max() < 0.001 - - def test_stable_diffusion_pix2pix_ddim(self): - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - "timbrooks/instruct-pix2pix", safety_checker=None - ) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.51511174, 0.5185677, 0.51326, 0.5176025, 0.514665, 0.519833, 0.52196854, 0.5121842, 0.52435803] - ) - assert np.abs(expected_slice - image_slice).max() < 0.001 - - def test_stable_diffusion_pix2pix_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([-0.7104, -0.8994, -1.387, 1.825, 1.964, 1.377, 1.158, 1.556, 1.227]) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([-0.7124, -0.9087, -1.384, 1.826, 1.992, 1.368, 1.16, 1.537, 1.239]) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 - - callback_fn.has_been_called = False - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - "timbrooks/instruct-pix2pix", safety_checker=None, paddle_dtype=paddle.float16 - ) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs() - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == 3 - - def test_stable_diffusion_pix2pix_pipeline_multiple_of_8(self): - inputs = self.get_inputs() - inputs["image"] = inputs["image"].resize((504, 504)) - model_id = "timbrooks/instruct-pix2pix" - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, safety_checker=None) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - output = pipe(**inputs) - image = output.images[0] - image_slice = image[255:258, 383:386, -1] - assert image.shape == (504, 504, 3) - expected_slice = np.array( - [0.183373, 0.20458564, 0.2428664, 0.18245864, 0.22010538, 0.25757712, 0.19680199, 0.2185145, 0.24869373] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py deleted file mode 100644 index 9c533dff3ffe..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle 
Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# import gc -# import unittest - -# import numpy as np -# import paddle - -# from ppdiffusers import StableDiffusionKDiffusionPipeline -# from ppdiffusers.utils import slow -# from ppdiffusers.utils.testing_utils import require_paddle_gpu - - -# @slow -# @require_paddle_gpu -# class StableDiffusionPipelineIntegrationTests(unittest.TestCase): -# def tearDown(self): -# super().tearDown() -# gc.collect() -# paddle.device.cuda.empty_cache() - -# def test_stable_diffusion_1(self): -# sd_pipe = StableDiffusionKDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") -# sd_pipe.set_progress_bar_config(disable=None) -# sd_pipe.set_scheduler("sample_euler") -# prompt = "A painting of a squirrel eating a burger" -# generator = paddle.Generator().manual_seed(0) -# output = sd_pipe([prompt], generator=generator, guidance_scale=9.0, num_inference_steps=20, output_type="np") -# image = output.images -# image_slice = image[0, -3:, -3:, -1] -# assert image.shape == (1, 512, 512, 3) -# expected_slice = np.array([0.0447, 0.0492, 0.0468, 0.0408, 0.0383, 0.0408, 0.0354, 0.038, 0.0339]) -# assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - -# def test_stable_diffusion_2(self): -# sd_pipe = StableDiffusionKDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") -# sd_pipe.set_progress_bar_config(disable=None) -# sd_pipe.set_scheduler("sample_euler") -# prompt = "A painting of a squirrel eating a burger" -# generator = paddle.Generator().manual_seed(0) -# output = sd_pipe([prompt], generator=generator, guidance_scale=9.0, num_inference_steps=20, output_type="np") -# image = output.images -# image_slice = image[0, -3:, -3:, -1] -# assert image.shape == (1, 512, 512, 3) -# expected_slice = np.array([0.1237, 0.132, 0.1438, 0.1359, 0.139, 0.1132, 0.1277, 0.1175, 0.1112]) -# assert np.abs(image_slice.flatten() - expected_slice).max() < 0.5 - -# def test_stable_diffusion_karras_sigmas(self): -# sd_pipe = StableDiffusionKDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") -# sd_pipe.set_progress_bar_config(disable=None) - -# sd_pipe.set_scheduler("sample_dpmpp_2m") - -# prompt = "A painting of a squirrel eating a burger" -# generator = paddle.Generator().manual_seed(0) -# output = sd_pipe( -# [prompt], -# generator=generator, -# guidance_scale=7.5, -# num_inference_steps=15, -# output_type="np", -# use_karras_sigmas=True, -# ) - -# image = output.images - -# image_slice = image[0, -3:, -3:, -1] - -# assert image.shape == (1, 512, 512, 3) -# expected_slice = np.array( -# [0.11381689, 0.12112921, 0.1389457, 0.12549606, 0.1244964, 0.10831517, 0.11562866, 0.10867816, 0.10499048] -# ) - -# assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 \ No newline at end of file diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py 
b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py deleted file mode 100644 index 3bcdccc8b461..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import paddle - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionPanoramaPipeline, - UNet2DConditionModel, -) -from ppdiffusers.utils import slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - - -class StableDiffusionPanoramaPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionPanoramaPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=1, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler() - paddle.seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - paddle.seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed=seed) - inputs = { - "prompt": "a photo of the dolomites", - "generator": generator, - "height": None, - "width": None, - "num_inference_steps": 1, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_panorama_default_case(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionPanoramaPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = sd_pipe(**inputs).images - 
image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.28862977, 0.2441951, 0.2683525, 0.33122095, 0.28755113, 0.46375293, 0.254181, 0.30616608, 0.4785265] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - # override to speed the overall test timing up. - def test_inference_batch_consistent(self): - super().test_inference_batch_consistent(batch_sizes=[1, 2]) - - # override to speed the overall test timing up. - def test_inference_batch_single_identical(self): - super().test_inference_batch_single_identical(batch_size=2) - - def test_stable_diffusion_panorama_negative_prompt(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionPanoramaPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.28995812, 0.24463832, 0.2682391, 0.33033937, 0.2868188, 0.46267676, 0.25425047, 0.3066897, 0.47881347] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_panorama_euler(self): - components = self.get_dummy_components() - components["scheduler"] = EulerAncestralDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" - ) - sd_pipe = StableDiffusionPanoramaPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.32409406, 0.2660764, 0.41739762, 0.18994612, 0.32522476, 0.4869789, 0.13573006, 0.14128971, 0.32650158] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_panorama_pndm(self): - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler() - sd_pipe = StableDiffusionPanoramaPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - with self.assertRaises(ValueError): - _ = sd_pipe(**inputs).images - - -@slow -@require_paddle_gpu -class StableDiffusionPanoramaSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed=seed) - inputs = { - "prompt": "a photo of the dolomites", - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_panorama_default(self): - model_ckpt = "stabilityai/stable-diffusion-2-base" - scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler") - pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 2048, 3) - expected_slice = np.array( - [0.34261876, 0.3045774, 0.34545267, 0.33774284, 0.3431282, 0.33453488, 0.3094663, 0.32646674, 0.32534528] - ) - assert np.abs(expected_slice - image_slice).max() < 0.01 - - def test_stable_diffusion_panorama_k_lms(self): - pipe = 
StableDiffusionPanoramaPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base", safety_checker=None - ) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 2048, 3) - expected_slice = np.array( - [0.0, 0.01188838, 0.02675471, 0.00534895, 0.02325496, 0.01234779, 0.0348064, 0.0, 0.02607787] - ) - assert np.abs(expected_slice - image_slice).max() < 0.01 - - def test_stable_diffusion_panorama_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 256) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [ - 0.7392851114273071, - -0.16683124005794525, - 0.2063215672969818, - -0.09840865433216095, - 0.18722617626190186, - -0.08375956118106842, - 0.06995373964309692, - -0.20892930030822754, - -0.157355397939682, - ] - ) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 256) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [ - 0.7368452548980713, - -0.16317462921142578, - 0.20289096236228943, - -0.10271137207746506, - 0.1873130351305008, - -0.08454630523920059, - 0.06944799423217773, - -0.20782311260700226, - -0.15696658194065094, - ] - ) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 - - callback_fn.has_been_called = False - model_ckpt = "stabilityai/stable-diffusion-2-base" - scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler") - pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs() - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == 3 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py deleted file mode 100644 index 3f79b0801706..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py +++ /dev/null @@ -1,409 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import unittest - -import numpy as np -import paddle - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMInverseScheduler, - DDIMScheduler, - DDPMScheduler, - EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler, - StableDiffusionPix2PixZeroPipeline, - UNet2DConditionModel, -) -from ppdiffusers.utils import load_image, slow -from ppdiffusers.utils.testing_utils import load_pt, require_paddle_gpu - -from ..pipeline_params import ( - TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin - - -def to_paddle(x): - if hasattr(x, "numpy"): - x = x.numpy() - return paddle.to_tensor(x) - - -# we use SGD optimizer in this pipeline, so the result is not stable! -class StableDiffusionPix2PixZeroPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionPix2PixZeroPipeline - - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - - @classmethod - def setUpClass(cls): - cls.source_embeds = to_paddle( - load_pt( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/src_emb_0.pt" - ) - ) - - cls.target_embeds = to_paddle( - load_pt( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/tgt_emb_0.pt" - ) - ) - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler() - paddle.seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - paddle.seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - "inverse_scheduler": None, - "caption_generator": None, - "caption_processor": None, - } - return components - - def get_dummy_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "cross_attention_guidance_amount": 0.15, - "source_embeds": self.source_embeds, - "target_embeds": self.target_embeds, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_pix2pix_zero_default_case(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = 
np.array( - [0.58762765, 0.17410329, 0.5067884, 0.39995563, 0.02808204, 0.35726422, 0.3250693, 0.3155224, 0.5268162] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 - - def test_stable_diffusion_pix2pix_zero_negative_prompt(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.5042143, 0.34658563, 0.56157184, 0.3707891, 0.23746812, 0.47898933, 0.2702424, 0.36307925, 0.50807047] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 - - def test_stable_diffusion_pix2pix_zero_euler(self): - components = self.get_dummy_components() - components["scheduler"] = EulerAncestralDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" - ) - sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.4870367, 0.2677226, 0.37830275, 0.63265973, 0.32151344, 0.406371, 0.67513967, 0.5246535, 0.55954224] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 - - def test_stable_diffusion_pix2pix_zero_ddpm(self): - components = self.get_dummy_components() - components["scheduler"] = DDPMScheduler() - sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.5899046, 0.17750263, 0.50616807, 0.39558932, 0.02976257, 0.35918522, 0.32376733, 0.31742626, 0.52768075] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 - - def test_stable_diffusion_pix2pix_zero_num_images_per_prompt(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - images = sd_pipe(**inputs).images - assert images.shape == (1, 64, 64, 3) - num_images_per_prompt = 2 - inputs = self.get_dummy_inputs() - images = sd_pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images - assert images.shape == (num_images_per_prompt, 64, 64, 3) - batch_size = 2 - inputs = self.get_dummy_inputs() - inputs["prompt"] = [inputs["prompt"]] * batch_size - images = sd_pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images - assert images.shape == (batch_size * num_images_per_prompt, 64, 64, 3) - - # Non-determinism caused by the scheduler optimizing the latent inputs during inference - @unittest.skip("non-deterministic pipeline") - def test_inference_batch_single_identical(self): - return super().test_inference_batch_single_identical() - - -@slow -@require_paddle_gpu -class StableDiffusionPix2PixZeroPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - @classmethod - def setUpClass(cls): - cls.source_embeds = to_paddle( - 
load_pt("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat.pt") - ) - - cls.target_embeds = to_paddle( - load_pt("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog.pt") - ) - - def get_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed=seed) - inputs = { - "prompt": "turn him into a cyborg", - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "cross_attention_guidance_amount": 0.15, - "source_embeds": self.source_embeds, - "target_embeds": self.target_embeds, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_pix2pix_zero_default(self): - pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, paddle_dtype=paddle.float16 - ) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.8129883, 0.81933594, 0.80371094, 0.8105469, 0.8076172, 0.80566406, 0.81884766, 0.8330078, 0.82470703] - ) - assert np.abs(expected_slice - image_slice).max() < 0.05 - - def test_stable_diffusion_pix2pix_zero_k_lms(self): - pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, paddle_dtype=paddle.float16 - ) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05053711]) - assert np.abs(expected_slice - image_slice).max() < 0.05 - - def test_stable_diffusion_pix2pix_zero_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [ - 0.93444633, - 1.1613252, - 0.7700033, - 0.18847837, - -1.17147, - 0.07546477, - 0.06142269, - -0.8030814, - -0.59692276, - ] - ) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [ - 0.93180454, - 1.1606954, - 0.7721853, - 0.18454231, - -1.1679069, - 0.07357024, - 0.06213593, - -0.80399096, - -0.5937987, - ] - ) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 - - callback_fn.has_been_called = False - pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, paddle_dtype=paddle.float16 - ) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == 3 - - -@slow -@require_paddle_gpu -class InversionPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - 
paddle.device.cuda.empty_cache() - - @classmethod - def setUpClass(cls): - raw_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png" - ) - - raw_image = raw_image.convert("RGB").resize((512, 512)) - - cls.raw_image = raw_image - - def test_stable_diffusion_pix2pix_inversion(self): - pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, paddle_dtype=paddle.float16 - ) - pipe.inverse_scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config) - caption = "a photography of a cat with flowers" - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - output = pipe.invert(caption, image=self.raw_image, generator=generator, num_inference_steps=10) - inv_latents = output[0] - image_slice = inv_latents[0, -3:, -3:, -1].flatten() - assert tuple(inv_latents.shape) == (1, 4, 64, 64) - expected_slice = np.array([0.8877, 0.0587, 0.77, -1.6035, -0.5962, 0.4827, -0.6265, 1.0498, -0.8599]) - assert np.abs(expected_slice - image_slice.cpu().numpy()).max() < 0.05 - - def test_stable_diffusion_pix2pix_full(self): - pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, paddle_dtype=paddle.float16 - ) - pipe.inverse_scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config) - caption = "a photography of a cat with flowers" - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - output = pipe.invert(caption, image=self.raw_image, generator=generator) - inv_latents = output[0] - source_prompts = 4 * ["a cat sitting on the street", "a cat playing in the field", "a face of a cat"] - target_prompts = 4 * ["a dog sitting on the street", "a dog playing in the field", "a face of a dog"] - source_embeds = pipe.get_embeds(source_prompts) - target_embeds = pipe.get_embeds(target_prompts) - image = pipe( - caption, - source_embeds=source_embeds, - target_embeds=target_embeds, - num_inference_steps=50, - cross_attention_guidance_amount=0.15, - generator=generator, - latents=inv_latents, - negative_prompt=caption, - output_type="np", - ).images - - image_slice = image[0, -3:, -3:, -1].flatten() - expected_slice = np.array( - [ - 0.64208984375, - 0.65673828125, - 0.650390625, - 0.6513671875, - 0.646484375, - 0.6650390625, - 0.6513671875, - 0.6640625, - 0.66796875, - ] - ) - max_diff = np.abs(image_slice - expected_slice).max() - assert max_diff < 0.05 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py deleted file mode 100644 index c54e50a1ccef..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import paddle - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - StableDiffusionSAGPipeline, - UNet2DConditionModel, -) -from ppdiffusers.utils import slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - - -class StableDiffusionSAGPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionSAGPipeline - test_cpu_offload = False - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - paddle.seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - paddle.seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed) - - inputs = { - "prompt": ".", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 1.0, - "sag_scale": 1.0, - "output_type": "numpy", - } - return inputs - - -@slow -@require_paddle_gpu -class StableDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_stable_diffusion_1(self): - sag_pipe = StableDiffusionSAGPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - sag_pipe.set_progress_bar_config(disable=None) - prompt = "." 
- generator = paddle.Generator().manual_seed(0) - output = sag_pipe( - [prompt], generator=generator, guidance_scale=7.5, sag_scale=1.0, num_inference_steps=20, output_type="np" - ) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.7477613, 0.76045597, 0.7464366, 0.778965, 0.75718963, 0.7487634, 0.77530396, 0.77426934, 0.7749926] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 - - def test_stable_diffusion_2(self): - sag_pipe = StableDiffusionSAGPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") - sag_pipe.set_progress_bar_config(disable=None) - prompt = "." - generator = paddle.Generator().manual_seed(0) - output = sag_pipe( - [prompt], generator=generator, guidance_scale=7.5, sag_scale=1.0, num_inference_steps=20, output_type="np" - ) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.8771595, 0.8521123, 0.8644101, 0.8680052, 0.8700466, 0.8897612, 0.87766427, 0.8636212, 0.86829203] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/__init__.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py deleted file mode 100644 index e98ec9e45e33..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py +++ /dev/null @@ -1,413 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import unittest - -import numpy as np -import paddle - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionPipeline, - UNet2DConditionModel, - logging, -) -from ppdiffusers.utils import load_numpy, nightly, slow -from ppdiffusers.utils.testing_utils import CaptureLogger, require_paddle_gpu - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - - -class StableDiffusion2PipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - attention_head_dim=(2, 4), - use_linear_projection=True, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - paddle.seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - paddle.seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - hidden_act="gelu", - projection_dim=512, - ) - text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_ddim(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.3505131, 0.36318004, 0.39201266, 0.12107915, 0.27704653, 0.40363187, 0.09379572, 0.16225743, 0.36048344] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_pndm(self): - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert 
image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.25144678, 0.35438284, 0.3613463, 0.11020249, 0.3101831, 0.42739886, 0.1142821, 0.17371863, 0.35148838] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_k_lms(self): - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.3676631, 0.38155898, 0.4023114, 0.11294425, 0.2891888, 0.40432304, 0.08882684, 0.1466648, 0.33633134] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_k_euler_ancestral(self): - components = self.get_dummy_components() - components["scheduler"] = EulerAncestralDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.36797395, 0.38137895, 0.40199342, 0.11330777, 0.2886864, 0.40422022, 0.08929691, 0.14658183, 0.3363046] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_k_euler(self): - components = self.get_dummy_components() - components["scheduler"] = EulerDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.36766386, 0.3815591, 0.40231153, 0.11294428, 0.28918856, 0.40432304, 0.08882678, 0.14666462, 0.3363313] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_long_prompt(self): - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - do_classifier_free_guidance = True - negative_prompt = None - num_images_per_prompt = 1 - logger = logging.get_logger("ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion") - prompt = 25 * "@" - with CaptureLogger(logger) as cap_logger_3: - text_embeddings_3 = sd_pipe._encode_prompt( - prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - prompt = 100 * "@" - with CaptureLogger(logger) as cap_logger: - text_embeddings = sd_pipe._encode_prompt( - prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - negative_prompt = "Hello" - with CaptureLogger(logger) as cap_logger_2: - text_embeddings_2 = sd_pipe._encode_prompt( - prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape - assert text_embeddings.shape[1] == 77 - assert cap_logger.out == cap_logger_2.out - assert cap_logger.out.count("@") == 25 - assert cap_logger_3.out == "" - - -@slow -@require_paddle_gpu -class 
StableDiffusion2PipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, dtype="float32", seed=0): - generator = paddle.Generator().manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = paddle.to_tensor(latents).cast(dtype) - inputs = { - "prompt": "a photograph of an astronaut riding a horse", - "latents": latents, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_default_ddim(self): - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") - pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.49493, 0.47896, 0.40798, 0.54214, 0.53212, 0.48202, 0.47656, 0.46329, 0.48506]) - assert np.abs(image_slice - expected_slice).max() < 0.0001 - - def test_stable_diffusion_pndm(self): - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") - pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.49493, 0.47896, 0.40798, 0.54214, 0.53212, 0.48202, 0.47656, 0.46329, 0.48506]) - assert np.abs(image_slice - expected_slice).max() < 0.0001 - - def test_stable_diffusion_k_lms(self): - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.1044, 0.13115, 0.111, 0.10141, 0.1144, 0.07215, 0.11332, 0.09693, 0.10006]) - assert np.abs(image_slice - expected_slice).max() < 0.0001 - - # def test_stable_diffusion_attention_slicing(self): - # pipe = StableDiffusionPipeline.from_pretrained( - # "stabilityai/stable-diffusion-2-base", paddle_dtype=paddle.float16 - # ) - # pipe.set_progress_bar_config(disable=None) - # pipe.enable_attention_slicing() - # inputs = self.get_inputs(dtype="float16") - # image_sliced = pipe(**inputs).images - # mem_bytes = paddle.device.cuda.memory_allocated() - # assert mem_bytes < 3.3 * 10**9 - # pipe.disable_attention_slicing() - # inputs = self.get_inputs(dtype="float16") - # image = pipe(**inputs).images - # mem_bytes = paddle.device.cuda.memory_allocated() - # assert mem_bytes > 3.3 * 10**9 - # assert np.abs(image_sliced - image).max() < 0.001 - - def test_stable_diffusion_text2img_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.3862, -0.4507, -1.1729, 0.0686, -1.1045, 0.7124, -1.8301, 0.1903, 1.2773] - ) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 - elif step == 2: - latents = 
latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.272, -0.1863, -0.7383, -0.5029, -0.7534, 0.397, -0.7646, 0.4468, 1.2686]) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 - - callback_fn.has_been_called = False - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base", paddle_dtype=paddle.float16 - ) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs(dtype="float16") - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == inputs["num_inference_steps"] - - -@nightly -@require_paddle_gpu -class StableDiffusion2PipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, dtype="float32", seed=0): - generator = paddle.Generator().manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = paddle.to_tensor(latents).cast(dtype) - inputs = { - "prompt": "a photograph of an astronaut riding a horse", - "latents": latents, - "generator": generator, - "num_inference_steps": 50, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_2_0_default_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_2_text2img/stable_diffusion_2_0_base_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.01 - - def test_stable_diffusion_2_1_default_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_2_text2img/stable_diffusion_2_1_base_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.01 - - def test_stable_diffusion_ddim(self): # not pass - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_2_text2img/stable_diffusion_2_1_base_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.01 - - def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_2_text2img/stable_diffusion_2_1_base_lms.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.01 - - def 
test_stable_diffusion_euler(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") - sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_2_text2img/stable_diffusion_2_1_base_euler.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.01 - - def test_stable_diffusion_dpm(self): # not pass - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs() - inputs["num_inference_steps"] = 25 - image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_2_text2img/stable_diffusion_2_1_base_dpm_multi.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 0.01 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py deleted file mode 100644 index 19940bd6f827..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import unittest - -import numpy as np -import paddle - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - StableDiffusionAttendAndExcitePipeline, - UNet2DConditionModel, -) -from ppdiffusers.utils import load_numpy, slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - - -class StableDiffusionAttendAndExcitePipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionAttendAndExcitePipeline - test_attention_slicing = False - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS.union({"token_indices"}) - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - attention_head_dim=(2, 4), - use_linear_projection=True, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - paddle.seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - paddle.seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - hidden_act="gelu", - projection_dim=512, - ) - text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed) - - inputs = inputs = { - "prompt": "a cat and a frog", - "token_indices": [2, 5], - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - "max_iter_to_alter": 2, - "thresholds": {(0): 0.7}, - } - return inputs - - def test_inference(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - self.assertEqual(image.shape, (1, 64, 64, 3)) - expected_slice = np.array( - [ - 0.33271241188049316, - 0.3123358190059662, - 0.44427454471588135, - 0.08615309000015259, - 0.26107650995254517, - 0.4551312029361725, - 0.06545555591583252, - 0.1626836657524109, - 0.3982071578502655, - ] - ) - max_diff = np.abs(image_slice.flatten() - expected_slice).max() - self.assertLessEqual(max_diff, 0.001) - - def test_inference_batch_consistent(self): - # NOTE: Larger batch sizes cause this test to timeout, only test on smaller batches - self._test_inference_batch_consistent(batch_sizes=[2, 4]) - - -@require_paddle_gpu -@slow -class 
StableDiffusionAttendAndExcitePipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_attend_and_excite_fp16(self): - generator = paddle.Generator().manual_seed(seed=51) - pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, paddle_dtype=paddle.float16 - ) - - prompt = "a painting of an elephant with glasses" - token_indices = [5, 7] - image = pipe( - prompt=prompt, - token_indices=token_indices, - guidance_scale=7.5, - generator=generator, - num_inference_steps=5, - max_iter_to_alter=5, - output_type="numpy", - ).images[0] - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/attend-and-excite/elephant_glasses.npy" - ) - assert np.abs((expected_image - image).max()) < 0.5 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py deleted file mode 100644 index abde0eb246ac..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import random -import tempfile -import unittest - -import numpy as np -import paddle -from PIL import Image - -from paddlenlp.transformers import ( - CLIPTextConfig, - CLIPTextModel, - CLIPTokenizer, - DPTConfig, - DPTForDepthEstimation, - DPTImageProcessor, -) -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionDepth2ImgPipeline, - UNet2DConditionModel, -) -from ppdiffusers.utils import floats_tensor, load_image, nightly, slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - -from ..pipeline_params import ( - TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin - - -class StableDiffusionDepth2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionDepth2ImgPipeline - test_save_load_optional_components = False - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} - required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=5, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - attention_head_dim=(2, 4), - use_linear_projection=True, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - paddle.seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - paddle.seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - backbone_config = { - "global_padding": "same", - "layer_type": "bottleneck", - "depths": [3, 4, 9], - "out_features": ["stage1", "stage2", "stage3"], - "embedding_dynamic_padding": True, - "hidden_sizes": [96, 192, 384, 768], - "num_groups": 2, - } - depth_estimator_config = DPTConfig( - image_size=32, - patch_size=16, - num_channels=3, - hidden_size=32, - num_hidden_layers=4, - backbone_out_indices=(0, 1, 2, 3), - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - is_decoder=False, - initializer_range=0.02, - is_hybrid=True, - backbone_config=backbone_config, - backbone_featmap_shape=[1, 384, 24, 24], - ) - depth_estimator = DPTForDepthEstimation(depth_estimator_config) - feature_extractor = DPTImageProcessor.from_pretrained("hf-internal-testing/tiny-random-DPTForDepthEstimation") - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "depth_estimator": depth_estimator, - "feature_extractor": feature_extractor, - } - return components - - def get_dummy_inputs(self, seed=0): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - image = image.cpu().transpose(perm=[0, 2, 3, 1])[0] - image = 
Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32)) - generator = paddle.Generator().manual_seed(seed) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_save_load_local(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - output = pipe(**inputs)[0] - with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir) - pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, from_diffusers=False) - pipe_loaded.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - output_loaded = pipe_loaded(**inputs)[0] - max_diff = np.abs(output - output_loaded).max() - self.assertLess(max_diff, 0.005) - - def test_save_load_float16(self): - pass - # components = self.get_dummy_components() - # for name, module in components.items(): - # if hasattr(module, "to"): - # components[name] = module.to(dtype=paddle.float16) - # pipe = self.pipeline_class(**components) - # pipe.set_progress_bar_config(disable=None) - # inputs = self.get_dummy_inputs() - # output = pipe(**inputs)[0] - # with tempfile.TemporaryDirectory() as tmpdir: - # pipe.save_pretrained(tmpdir) - # pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, paddle_dtype=paddle.float16, from_diffusers=False) - # pipe_loaded.set_progress_bar_config(disable=None) - # for name, component in pipe_loaded.components.items(): - # if hasattr(component, "dtype"): - # self.assertTrue( - # component.dtype == paddle.float16, - # f"`{name}.dtype` switched from `float16` to {component.dtype} after loading.", - # ) - # inputs = self.get_dummy_inputs() - # output_loaded = pipe_loaded(**inputs)[0] - # max_diff = np.abs(output - output_loaded).max() - # self.assertLess(max_diff, 5, "The output of the fp16 pipeline changed after saving and loading.") - - def test_float16_inference(self): - # TODO not passed - pass - # components = self.get_dummy_components() - # pipe = self.pipeline_class(**components) - # pipe.set_progress_bar_config(disable=None) - # for name, module in components.items(): - # if hasattr(module, "to"): - # components[name] = module.to(dtype=paddle.float16) - # pipe_fp16 = self.pipeline_class(**components) - # pipe_fp16.set_progress_bar_config(disable=None) - # output = pipe(**self.get_dummy_inputs())[0] - # output_fp16 = pipe_fp16(**self.get_dummy_inputs())[0] - # max_diff = np.abs(output - output_fp16).max() - # self.assertLess(max_diff, 0.8, "The outputs of the fp16 and fp32 pipelines are too different.") - - def test_dict_tuple_outputs_equivalent(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - output = pipe(**self.get_dummy_inputs())[0] - output_tuple = pipe(**self.get_dummy_inputs(), return_dict=False)[0] - max_diff = np.abs(output - output_tuple).max() - self.assertLess(max_diff, 0.005) - - def test_progress_bar(self): - super().test_progress_bar() - - def test_stable_diffusion_depth2img_default_case(self): - components = self.get_dummy_components() - pipe = StableDiffusionDepth2ImgPipeline(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = 
np.array( - [0.35397637, 0.23190483, 0.20131412, 0.27374774, 0.265134, 0.4502194, 0.26852018, 0.37504935, 0.43135768] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - - def test_stable_diffusion_depth2img_negative_prompt(self): - components = self.get_dummy_components() - pipe = StableDiffusionDepth2ImgPipeline(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - negative_prompt = "french fries" - output = pipe(**inputs, negative_prompt=negative_prompt) - image = output.images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [0.40259343, 0.37764466, 0.3936328, 0.3628915, 0.48100996, 0.59685427, 0.22927544, 0.45186657, 0.46950823] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - - def test_stable_diffusion_depth2img_multiple_init_images(self): - components = self.get_dummy_components() - pipe = StableDiffusionDepth2ImgPipeline(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - inputs["prompt"] = [inputs["prompt"]] * 2 - inputs["image"] = 2 * [inputs["image"]] - image = pipe(**inputs).images - image_slice = image[-1, -3:, -3:, -1] - assert image.shape == (2, 32, 32, 3) - expected_slice = np.array( - [0.8169553, 0.4573238, 0.27039874, 0.60622, 0.35670877, 0.39508212, 0.56803817, 0.5341117, 0.44428858] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - - def test_stable_diffusion_depth2img_num_images_per_prompt(self): - components = self.get_dummy_components() - pipe = StableDiffusionDepth2ImgPipeline(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - images = pipe(**inputs).images - assert images.shape == (1, 32, 32, 3) - batch_size = 2 - inputs = self.get_dummy_inputs() - inputs["prompt"] = [inputs["prompt"]] * batch_size - images = pipe(**inputs).images - assert images.shape == (batch_size, 32, 32, 3) - num_images_per_prompt = 2 - inputs = self.get_dummy_inputs() - images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images - assert images.shape == (num_images_per_prompt, 32, 32, 3) - batch_size = 2 - inputs = self.get_dummy_inputs() - inputs["prompt"] = [inputs["prompt"]] * batch_size - images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images - assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3) - - def test_stable_diffusion_depth2img_pil(self): - components = self.get_dummy_components() - pipe = StableDiffusionDepth2ImgPipeline(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array( - [0.35397637, 0.23190483, 0.20131412, 0.27374774, 0.265134, 0.4502194, 0.26852018, 0.37504935, 0.43135768] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 - - -@slow -@require_paddle_gpu -class StableDiffusionDepth2ImgPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, dtype="float32", seed=0): - generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/depth2img/two_cats.png" - ) - inputs = { - "prompt": "two tigers", - "image": init_image, - "generator": generator, - "num_inference_steps": 3, - "strength": 0.75, - 
"guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_depth2img_pipeline_default(self): - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None - ) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - assert image.shape == (1, 480, 640, 3) - # expected_slice = np.array([0.9057, 0.9365, 0.9258, 0.8937, 0.8555, 0.8541, 0.826, 0.7747, 0.7421]) - expected_slice = np.array( - [0.75446224, 0.746921, 0.7595095, 0.8161169, 0.8059271, 0.7999228, 0.9052905, 0.879215, 0.8690305] - ) - assert np.abs(expected_slice - image_slice).max() < 0.1 - - def test_stable_diffusion_depth2img_pipeline_k_lms(self): - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None - ) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - assert image.shape == (1, 480, 640, 3) - # expected_slice = np.array([0.6363, 0.6274, 0.6309, 0.637, 0.6226, 0.6286, 0.6213, 0.6453, 0.6306]) - expected_slice = np.array( - [0.6395747, 0.64879197, 0.6566683, 0.6438427, 0.6707787, 0.63587487, 0.66576767, 0.62180007, 0.6628648] - ) - assert np.abs(expected_slice - image_slice).max() < 0.1 - - def test_stable_diffusion_depth2img_pipeline_ddim(self): - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None - ) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - assert image.shape == (1, 480, 640, 3) - # expected_slice = np.array([0.6424, 0.6524, 0.6249, 0.6041, 0.6634, 0.642, 0.6522, 0.6555, 0.6436]) - expected_slice = np.array( - [0.6283968, 0.6419119, 0.6295293, 0.63652724, 0.6420511, 0.61574477, 0.62251365, 0.65826833, 0.6480877] - ) - - assert np.abs(expected_slice - image_slice).max() < 0.15 - - def test_stable_diffusion_depth2img_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 60, 80) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([-1.148, -0.2147, -0.618, -2.48, -2.348, 0.3945, -2.05, -1.566, -1.52]) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.1 - - callback_fn.has_been_called = False - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None, paddle_dtype=paddle.float16 - ) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - inputs = self.get_inputs(dtype="float16") - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == 2 - - -@nightly -@require_paddle_gpu -class StableDiffusionImg2ImgPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - 
gc.collect() - paddle.device.cuda.empty_cache() - - def get_inputs(self, dtype="float32", seed=0): - generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/depth2img/two_cats.png" - ) - inputs = { - "prompt": "two tigers", - "image": init_image, - "generator": generator, - "num_inference_steps": 3, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - # # Neither diffusers nor ppdiffusers can pass the test at present - # def test_depth2img_pndm(self): - # pipe = StableDiffusionDepth2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-depth") - # pipe.set_progress_bar_config(disable=None) - # inputs = self.get_inputs() - # image = pipe(**inputs).images[0] - # expected_image = load_numpy( - # "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_depth2img/stable_diffusion_2_0_pndm.npy" - # ) - # max_diff = np.abs(expected_image - image).max() - # assert max_diff < 0.001 - - # # Neither diffusers nor ppdiffusers can pass the test at present - # def test_depth2img_ddim(self): - # pipe = StableDiffusionDepth2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-depth") - # pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - # pipe.set_progress_bar_config(disable=None) - # inputs = self.get_inputs() - # image = pipe(**inputs).images[0] - # expected_image = load_numpy( - # "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_depth2img/stable_diffusion_2_0_ddim.npy" - # ) - # max_diff = np.abs(expected_image - image).max() - # assert max_diff < 0.001 - - # # Neither diffusers nor ppdiffusers can pass the test at present - # def test_img2img_lms(self): - # pipe = StableDiffusionDepth2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-depth") - # pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - # pipe.set_progress_bar_config(disable=None) - # inputs = self.get_inputs() - # image = pipe(**inputs).images[0] - # expected_image = load_numpy( - # "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_depth2img/stable_diffusion_2_0_lms.npy" - # ) - # max_diff = np.abs(expected_image - image).max() - # assert max_diff < 0.001 - - # # Neither diffusers nor ppdiffusers can pass the test at present - # def test_img2img_dpm(self): - # pipe = StableDiffusionDepth2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-depth") - # pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - # pipe.set_progress_bar_config(disable=None) - # inputs = self.get_inputs() - # inputs["num_inference_steps"] = 30 - # image = pipe(**inputs).images[0] - # expected_image = load_numpy( - # "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_depth2img/stable_diffusion_2_0_dpm_multi.npy" - # ) - # max_diff = np.abs(expected_image - image).max() - # assert max_diff < 0.001 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py deleted file mode 100644 index cf59620bfbe6..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py +++ /dev/null @@ -1,194 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import paddle -from PIL import Image - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - PNDMScheduler, - StableDiffusionInpaintPipeline, - UNet2DConditionModel, -) -from ppdiffusers.utils import floats_tensor, load_image -from ppdiffusers.utils.testing_utils import require_paddle_gpu, slow - -from ..pipeline_params import ( - TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin - - -class StableDiffusion2InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionInpaintPipeline - params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=9, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - attention_head_dim=(2, 4), - use_linear_projection=True, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - paddle.seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - paddle.seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - hidden_act="gelu", - projection_dim=512, - ) - text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, seed=0): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - image = image.cpu().transpose(perm=[0, 2, 3, 1])[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64)) - generator = paddle.Generator().manual_seed(seed) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_inpaint(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionInpaintPipeline(**components) - 
sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.58470726, 0.49302375, 0.3954028, 0.4068969, 0.33668613, 0.50350493, 0.34411103, 0.25261122, 0.4531455] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - -@slow -@require_paddle_gpu -class StableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_stable_diffusion_inpaint_pipeline(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/init_image.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/mask.png" - ) - # invalid expected_image - # expected_image = load_numpy( - # 'https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/yellow_cat_sitting_on_a_park_bench.npy' - # ) - model_id = "stabilityai/stable-diffusion-2-inpainting" - pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - prompt = "Face of a yellow cat, high resolution, sitting on a park bench" - generator = paddle.Generator().manual_seed(0) - output = pipe(prompt=prompt, image=init_image, mask_image=mask_image, generator=generator, output_type="np") - image = output.images[0] - assert image.shape == (512, 512, 3) - image = image[-3:, -3:, -1] - expected_image = [ - [[0.47980508], [0.49545538], [0.501472]], - [[0.36860222], [0.5465546], [0.54940426]], - [[0.44748512], [0.45160148], [0.48374733]], - ] - assert np.abs(expected_image - image).max() < 0.001 - - def test_stable_diffusion_inpaint_pipeline_fp16(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/init_image.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/mask.png" - ) - # invalid expected_image - # expected_image = load_numpy( - # 'https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/yellow_cat_sitting_on_a_park_bench_fp16.npy' - # ) - model_id = "stabilityai/stable-diffusion-2-inpainting" - pipe = StableDiffusionInpaintPipeline.from_pretrained( - model_id, paddle_dtype=paddle.float16, safety_checker=None - ) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - prompt = "Face of a yellow cat, high resolution, sitting on a park bench" - generator = paddle.Generator().manual_seed(0) - output = pipe(prompt=prompt, image=init_image, mask_image=mask_image, generator=generator, output_type="np") - image = output.images[0] - assert image.shape == (512, 512, 3) - image = image[-3:, -3:, -1] - expected_image = [ - [[0.47851562], [0.4951172], [0.50097656]], - [[0.36865234], [0.546875], [0.5493164]], - [[0.44726562], [0.45141602], [0.48388672]], - ] - assert np.abs(expected_image - image).max() < 0.5 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py deleted file mode 100644 index 495a0e5c561a..000000000000 --- 
a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py +++ /dev/null @@ -1,229 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import paddle - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - EulerDiscreteScheduler, - StableDiffusionLatentUpscalePipeline, - StableDiffusionPipeline, - UNet2DConditionModel, -) -from ppdiffusers.utils import floats_tensor, load_image, slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - -from ..pipeline_params import ( - TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin - - -class StableDiffusionLatentUpscalePipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionLatentUpscalePipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - { - "height", - "width", - "cross_attention_kwargs", - "negative_prompt_embeds", - "prompt_embeds", - } - required_optional_params = PipelineTesterMixin.required_optional_params - {"num_images_per_prompt"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - test_cpu_offload = False - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 4 - sizes = 16, 16 - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) - return image - - def get_dummy_components(self): - paddle.seed(0) - model = UNet2DConditionModel( - act_fn="gelu", - attention_head_dim=8, - norm_num_groups=None, - block_out_channels=[32, 32, 64, 64], - time_cond_proj_dim=160, - conv_in_kernel=1, - conv_out_kernel=1, - cross_attention_dim=32, - down_block_types=( - "KDownBlock2D", - "KCrossAttnDownBlock2D", - "KCrossAttnDownBlock2D", - "KCrossAttnDownBlock2D", - ), - in_channels=8, - mid_block_type=None, - only_cross_attention=False, - out_channels=5, - resnet_time_scale_shift="scale_shift", - time_embedding_type="fourier", - timestep_post_act="gelu", - up_block_types=("KCrossAttnUpBlock2D", "KCrossAttnUpBlock2D", "KCrossAttnUpBlock2D", "KUpBlock2D"), - ) - vae = AutoencoderKL( - block_out_channels=[32, 32, 64, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - scheduler = EulerDiscreteScheduler(prediction_type="sample") - text_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - hidden_act="quick_gelu", - projection_dim=512, - ) - text_encoder = CLIPTextModel(text_config).eval() - tokenizer = 
CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - components = { - "unet": model.eval(), - "vae": vae.eval(), - "scheduler": scheduler, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - } - return components - - def get_dummy_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": self.dummy_image.cpu(), - "generator": generator, - "num_inference_steps": 2, - "output_type": "numpy", - } - return inputs - - def test_inference(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - self.assertEqual(image.shape, (1, 256, 256, 3)) - expected_slice = np.array( - [ - 0.5665861368179321, - 0.7449524402618408, - 0.0, - 0.1325536072254181, - 0.4274534583091736, - 0.0, - 0.0, - 0.14426982402801514, - 0.0, - ] - ) - max_diff = np.abs(image_slice.flatten() - expected_slice).max() - self.assertLessEqual(max_diff, 0.001) - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(relax_max_difference=False) - - -@require_paddle_gpu -@slow -class StableDiffusionLatentUpscalePipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_latent_upscaler_fp16(self): - generator = paddle.Generator().manual_seed(seed=33) - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16) - pipe.to("gpu") - upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained( - "stabilityai/sd-x2-latent-upscaler", paddle_dtype=paddle.float16 - ) - upscaler.to("gpu") - - prompt = "a photo of an astronaut high resolution, unreal engine, ultra realistic" - low_res_latents = pipe(prompt, generator=generator, output_type="latent").images - image = upscaler( - prompt=prompt, - image=low_res_latents, - num_inference_steps=20, - guidance_scale=0, - generator=generator, - output_type="np", - ).images[0] - # invalid expected_image - # expected_image = load_numpy( - # "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/latent-upscaler/astronaut_1024.npy" - # ) - image = image[-3:, -3:, 0] - expected_image = [ - [[0.03686523], [0.03759766], [0.05175781]], - [[0.03491211], [0.05126953], [0.04541016]], - [[0.02880859], [0.03369141], [0.05004883]], - ] - assert np.abs((expected_image - image).max()) < 0.5 - - def test_latent_upscaler_fp16_image(self): - generator = paddle.Generator().manual_seed(seed=33) - upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained( - "stabilityai/sd-x2-latent-upscaler", paddle_dtype=paddle.float16 - ) - upscaler.to("gpu") - - prompt = "the temple of fire by Ross Tran and Gerardo Dottori, oil on canvas" - low_res_img = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/latent-upscaler/fire_temple_512.png" - ) - image = upscaler( - prompt=prompt, - image=low_res_img, - num_inference_steps=20, - guidance_scale=0, - generator=generator, - output_type="np", - ).images[0] - # invalid expected_image - # expected_image = load_numpy( - # "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/latent-upscaler/fire_temple_1024.npy" - # ) - image = image[-3:, -3:, 0] - expected_image = [ - [[0.03686523], [0.03759766], 
[0.05151367]], - [[0.03491211], [0.05151367], [0.04541016]], - [[0.02880859], [0.03393555], [0.05004883]], - ] - assert np.abs((expected_image - image).max()) < 0.05 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py deleted file mode 100644 index 6139a6d67a85..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py +++ /dev/null @@ -1,260 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import paddle -from PIL import Image - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - DDPMScheduler, - StableDiffusionUpscalePipeline, - UNet2DConditionModel, -) -from ppdiffusers.utils import floats_tensor, load_image, slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - - -class StableDiffusionUpscalePipelineFastTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = (32, 32) - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) - return image - - @property - def dummy_cond_unet_upscale(self): - paddle.seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 32, 64), - layers_per_block=2, - sample_size=32, - in_channels=7, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - # SD2-specific config below - attention_head_dim=8, - use_linear_projection=True, - only_cross_attention=(True, True, False), - num_class_embeds=100, - ) - return model - - @property - def dummy_vae(self): - paddle.seed(0) - model = AutoencoderKL( - block_out_channels=[32, 32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return model - - @property - def dummy_text_encoder(self): - paddle.seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - # SD2-specific config below - hidden_act="gelu", - projection_dim=512, - ) - return CLIPTextModel(config).eval() - - def test_stable_diffusion_upscale(self): - unet = self.dummy_cond_unet_upscale - low_res_scheduler = DDPMScheduler() - scheduler = DDIMScheduler(prediction_type="v_prediction") - vae = self.dummy_vae - text_encoder = 
self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0] - low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - sd_pipe = StableDiffusionUpscalePipeline( - unet=unet, - low_res_scheduler=low_res_scheduler, - scheduler=scheduler, - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - max_noise_level=350, - ) - sd_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - generator = paddle.Generator().manual_seed(0) - output = sd_pipe( - [prompt], - image=low_res_image, - generator=generator, - guidance_scale=6.0, - noise_level=20, - num_inference_steps=2, - output_type="np", - ) - image = output.images - generator = paddle.Generator().manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - image=low_res_image, - generator=generator, - guidance_scale=6.0, - noise_level=20, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - expected_height_width = low_res_image.size[0] * 4 - assert image.shape == (1, expected_height_width, expected_height_width, 3) - expected_slice = np.array( - [0.0, 0.0, 0.3616839, 0.0, 0.04877859, 0.59195685, 0.23902711, 0.00838843, 0.5172206] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_upscale_batch(self): - unet = self.dummy_cond_unet_upscale - low_res_scheduler = DDPMScheduler() - scheduler = DDIMScheduler(prediction_type="v_prediction") - vae = self.dummy_vae - text_encoder = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0] - low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - sd_pipe = StableDiffusionUpscalePipeline( - unet=unet, - low_res_scheduler=low_res_scheduler, - scheduler=scheduler, - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - max_noise_level=350, - ) - sd_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - output = sd_pipe( - 2 * [prompt], - image=2 * [low_res_image], - guidance_scale=6.0, - noise_level=20, - num_inference_steps=2, - output_type="np", - ) - image = output.images - assert image.shape[0] == 2 - generator = paddle.Generator().manual_seed(0) - output = sd_pipe( - [prompt], - image=low_res_image, - generator=generator, - num_images_per_prompt=2, - guidance_scale=6.0, - noise_level=20, - num_inference_steps=2, - output_type="np", - ) - image = output.images - assert image.shape[0] == 2 - - def test_stable_diffusion_upscale_fp16(self): - """Test that stable diffusion upscale works with fp16""" - unet = self.dummy_cond_unet_upscale - low_res_scheduler = DDPMScheduler() - scheduler = DDIMScheduler(prediction_type="v_prediction") - vae = self.dummy_vae - text_encoder = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0] - low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - unet = unet.to(dtype=paddle.float16) - text_encoder = text_encoder.to(dtype=paddle.float16) - sd_pipe = StableDiffusionUpscalePipeline( - unet=unet, - 
low_res_scheduler=low_res_scheduler, - scheduler=scheduler, - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - max_noise_level=350, - ) - sd_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - generator = paddle.Generator().manual_seed(0) - image = sd_pipe( - [prompt], image=low_res_image, generator=generator, num_inference_steps=2, output_type="np" - ).images - expected_height_width = low_res_image.size[0] * 4 - assert image.shape == (1, expected_height_width, expected_height_width, 3) - - -@slow -@require_paddle_gpu -class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_stable_diffusion_upscale_pipeline(self): - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale/low_res_cat.png" - ) - # invalid expected_image - # expected_image = load_numpy( - # 'https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale/upsampled_cat.npy' - # ) - model_id = "stabilityai/stable-diffusion-x4-upscaler" - pipe = StableDiffusionUpscalePipeline.from_pretrained(model_id) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - prompt = "a cat sitting on a park bench" - generator = paddle.Generator().manual_seed(0) - output = pipe(prompt=prompt, image=image, generator=generator, output_type="np") - image = output.images[0] - assert image.shape == (512, 512, 3) - image = image[-3:, -3:, -1] - expected_image = [ - [[0.17348257], [0.15836588], [0.14607191]], - [[0.17892927], [0.1668604], [0.15961224]], - [[0.17489928], [0.1661663], [0.16446933]], - ] - assert np.abs(expected_image - image).max() < 0.05 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py deleted file mode 100644 index 4012387799a2..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py +++ /dev/null @@ -1,396 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import unittest - -import numpy as np -import paddle - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerDiscreteScheduler, - StableDiffusionPipeline, - UNet2DConditionModel, -) -from ppdiffusers.utils import slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - - -class StableDiffusion2VPredictionPipelineFastTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - @property - def dummy_cond_unet(self): - paddle.seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - attention_head_dim=(2, 4), - use_linear_projection=True, - ) - return model - - @property - def dummy_vae(self): - paddle.seed(0) - model = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - return model - - @property - def dummy_text_encoder(self): - paddle.seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - hidden_act="gelu", - projection_dim=64, - ) - return CLIPTextModel(config).eval() - - def test_stable_diffusion_v_pred_ddim(self): - unet = self.dummy_cond_unet - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - prediction_type="v_prediction", - ) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - sd_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - generator = paddle.Generator().manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - image = output.images - generator = paddle.Generator().manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.36126757, 0.40778637, 0.36956796, 0.14816678, 0.25735706, 0.36562037, 0.1229952, 0.22826642, 0.4154452] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_v_pred_k_euler(self): - unet = self.dummy_cond_unet - scheduler = EulerDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", prediction_type="v_prediction" - ) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = 
CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - sd_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - generator = paddle.Generator().manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - image = output.images - generator = paddle.Generator().manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.39991996, 0.45191997, 0.34044766, 0.2136086, 0.2758901, 0.31222183, 0.21658134, 0.34479994, 0.43742967] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_v_pred_fp16(self): - """Test that stable diffusion v-prediction works with fp16""" - unet = self.dummy_cond_unet - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - prediction_type="v_prediction", - ) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - unet = unet.to(dtype=paddle.float16) - vae = vae.to(dtype=paddle.float16) - bert = bert.to(dtype=paddle.float16) - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - sd_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - generator = paddle.Generator().manual_seed(0) - image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images - assert image.shape == (1, 64, 64, 3) - - -@slow -@require_paddle_gpu -class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_stable_diffusion_v_pred_default(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2") - sd_pipe.enable_attention_slicing() - sd_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - generator = paddle.Generator().manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=7.5, num_inference_steps=20, output_type="np") - image = output.images - image_slice = image[0, 253:256, 253:256, -1] - assert image.shape == (1, 768, 768, 3) - expected_slice = np.array( - [0.05667132, 0.05700234, 0.04156408, 0.04631725, 0.04327643, 0.06003231, 0.05165312, 0.05258191, 0.0865913] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_v_pred_upcast_attention(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1", paddle_dtype=paddle.float16 - ) - sd_pipe.enable_attention_slicing() - sd_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a 
squirrel eating a burger" - generator = paddle.Generator().manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=7.5, num_inference_steps=20, output_type="np") - image = output.images - image_slice = image[0, 253:256, 253:256, -1] - assert image.shape == (1, 768, 768, 3) - - expected_slice = np.array( - [0.04541016, 0.04516602, 0.05493164, 0.05078125, 0.04296875, 0.07275391, 0.06567383, 0.0534668, 0.04833984] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 - - def test_stable_diffusion_v_pred_euler(self): - scheduler = EulerDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-2", subfolder="scheduler") - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=scheduler) - sd_pipe.enable_attention_slicing() - sd_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - generator = paddle.Generator().manual_seed(0) - output = sd_pipe([prompt], generator=generator, num_inference_steps=5, output_type="numpy") - image = output.images - image_slice = image[0, 253:256, 253:256, -1] - assert image.shape == (1, 768, 768, 3) - expected_slice = np.array( - [ - 0.03515199, - 0.03756374, - 0.05046153, - 0.04240236, - 0.05509549, - 0.06556576, - 0.04710263, - 0.02758819, - 0.05959105, - ] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_v_pred_dpm(self): - """ - TODO: update this test after making DPM compatible with V-prediction! - """ - scheduler = DPMSolverMultistepScheduler.from_pretrained( - "stabilityai/stable-diffusion-2", subfolder="scheduler" - ) - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=scheduler) - sd_pipe.enable_attention_slicing() - sd_pipe.set_progress_bar_config(disable=None) - prompt = "a photograph of an astronaut riding a horse" - generator = paddle.Generator().manual_seed(0) - image = sd_pipe( - [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=5, output_type="numpy" - ).images - image_slice = image[0, 253:256, 253:256, -1] - assert image.shape == (1, 768, 768, 3) - expected_slice = np.array( - [0.20492354, 0.2115368, 0.2323401, 0.2415919, 0.25598443, 0.24843931, 0.25171167, 0.23580211, 0.23604062] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - # def test_stable_diffusion_attention_slicing_v_pred(self): - # model_id = 'stabilityai/stable-diffusion-2' - # pipe = StableDiffusionPipeline.from_pretrained(model_id, - # paddle_dtype=paddle.float16) - # pipe.set_progress_bar_config(disable=None) - # prompt = 'a photograph of an astronaut riding a horse' - # pipe.enable_attention_slicing() - # generator = paddle.Generator().manual_seed(0) - # output_chunked = pipe([prompt], generator=generator, guidance_scale - # =7.5, num_inference_steps=10, output_type='numpy') - # image_chunked = output_chunked.images - # mem_bytes = paddle.device.cuda.memory_allocated() - # assert mem_bytes < 5.5 * 10 ** 9 - # pipe.disable_attention_slicing() - # generator = paddle.Generator().manual_seed(0) - # output = pipe([prompt], generator=generator, guidance_scale=7.5, - # num_inference_steps=10, output_type='numpy') - # image = output.images - # mem_bytes = paddle.device.cuda.memory_allocated() - # assert mem_bytes > 5.5 * 10 ** 9 - # assert np.abs(image_chunked.flatten() - image.flatten()).max() < 0.001 - - def test_stable_diffusion_text2img_pipeline_v_pred_default(self): - # invalid expected_image - # expected_image = 
load_numpy( - # 'https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-text2img/astronaut_riding_a_horse_v_pred.npy' - # ) - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2") - pipe.enable_attention_slicing() - pipe.set_progress_bar_config(disable=None) - prompt = "astronaut riding a horse" - generator = paddle.Generator().manual_seed(0) - output = pipe(prompt=prompt, guidance_scale=7.5, generator=generator, output_type="np") - image = output.images[0] - assert image.shape == (768, 768, 3) - expected_image = np.array( - [0.26713198, 0.2630347, 0.25486767, 0.23375505, 0.24399692, 0.22363415, 0.24688962, 0.21346492, 0.23014635] - ) - image = image[-3:, -3:, -1].flatten() - assert np.abs(expected_image - image).max() < 0.075 - - def test_stable_diffusion_text2img_pipeline_v_pred_fp16(self): - # invalid expected_image - # expected_image = load_numpy( - # 'https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-text2img/astronaut_riding_a_horse_v_pred_fp16.npy' - # ) - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", paddle_dtype=paddle.float16) - pipe.set_progress_bar_config(disable=None) - prompt = "astronaut riding a horse" - generator = paddle.Generator().manual_seed(0) - output = pipe(prompt=prompt, guidance_scale=7.5, generator=generator, output_type="np") - image = output.images[0] - assert image.shape == (768, 768, 3) - expected_image = np.array( - [0.26220703, 0.25195312, 0.2434082, 0.22753906, 0.23632812, 0.21777344, 0.23901367, 0.20629883, 0.22192383] - ) - image = image[-3:, -3:, -1].flatten() - assert np.abs(expected_image - image).max() < 0.75 - - def test_stable_diffusion_text2img_intermediate_state_v_pred(self): - number_of_steps = 0 - - def test_callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: - test_callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 0: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 96, 96) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([-0.2542, -1.276, 0.426, -0.956, -1.173, -0.5884, 2.416, 0.1553, -1.21]) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 - elif step == 19: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 96, 96) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.959, -0.964, -0.614, 0.0977, -0.6953, -0.2343, 1.551, -0.03357, -0.11395] - ) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 - - test_callback_fn.has_been_called = False - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", paddle_dtype=paddle.float16) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - prompt = "Andromeda galaxy in a bottle" - generator = paddle.Generator().manual_seed(0) - pipe( - prompt=prompt, - num_inference_steps=20, - guidance_scale=7.5, - generator=generator, - callback=test_callback_fn, - callback_steps=1, - ) - assert test_callback_fn.has_been_called - assert number_of_steps == 20 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_safe/__init__.py b/ppdiffusers/tests/pipelines/stable_diffusion_safe/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion_safe/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
-# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py deleted file mode 100644 index 10c819e3561c..000000000000 --- a/ppdiffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py +++ /dev/null @@ -1,374 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import tempfile -import unittest - -import numpy as np -import paddle - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - UNet2DConditionModel, -) -from ppdiffusers.pipelines.stable_diffusion_safe import ( - StableDiffusionPipelineSafe as StableDiffusionPipeline, -) -from ppdiffusers.utils import floats_tensor, nightly -from ppdiffusers.utils.testing_utils import require_paddle_gpu - - -class SafeDiffusionPipelineFastTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = 32, 32 - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) - return image - - @property - def dummy_cond_unet(self): - paddle.seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_vae(self): - paddle.seed(0) - model = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return model - - @property - def dummy_text_encoder(self): - paddle.seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config).eval() - - @property - def dummy_extractor(self): - def 
extract(*args, **kwargs): - class Out: - def __init__(self): - self.pixel_values = paddle.ones(shape=[0]) - - def to(self, device): - self.pixel_values - return self - - return Out() - - return extract - - def test_safe_diffusion_ddim(self): - unet = self.dummy_cond_unet - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - generator = paddle.Generator().manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - image = output.images - generator = paddle.Generator().manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.28519452, 0.23807159, 0.38150585, 0.21930319, 0.26092738, 0.517212,0.2563907 , 0.2503956 , 0.47978917 ] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_pndm(self): - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - generator = paddle.Generator().manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - image = output.images - generator = paddle.Generator().manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 0.18763152, 0.24242553, 0.36067978, 0.21772456, 0.27213728, 0.5194623, 0.2227565 , 0.2217454 , 0.4453961 - ] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 - - def test_stable_diffusion_no_safety_checker(self): - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None - ) - assert isinstance(pipe, StableDiffusionPipeline) - assert isinstance(pipe.scheduler, LMSDiscreteScheduler) - assert pipe.safety_checker is None - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - with tempfile.TemporaryDirectory() as tmpdirname: - 
pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionPipeline.from_pretrained(tmpdirname, from_diffusers=False) - assert pipe.safety_checker is None - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - def test_stable_diffusion_fp16(self): - """Test that stable diffusion works with fp16""" - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - unet = unet.to(dtype=paddle.float16) - vae = vae.to(dtype=paddle.float16) - bert = bert.to(dtype=paddle.float16) - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" - image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images - assert image.shape == (1, 64, 64, 3) - - -@nightly -@require_paddle_gpu -class SafeDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_harm_safe_stable_diffusion(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - prompt = "portrait of girl with smokey eyes makeup in abandoned hotel, grange clothes, redshift, wide high angle coloured polaroid photograph with flash, kodak film, hyper real, stunning moody cinematography, with anamorphic lenses, by maripol, fallen angels by wong kar - wai, style of suspiria and neon demon and children from bahnhof zoo, detailed " - seed = 4003660346 - guidance_scale = 7 - generator = paddle.Generator().manual_seed(seed=seed) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=0, - ) - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [0.2278, 0.2231, 0.2249, 0.2333, 0.2303, 0.1885, 0.2273, 0.2144, 0.2176] - assert image.shape == (1, 512, 512, 3) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - generator = paddle.Generator().manual_seed(seed=seed) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=2000, - sld_warmup_steps=7, - sld_threshold=0.025, - sld_momentum_scale=0.5, - sld_mom_beta=0.7, - ) - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [0.2383, 0.2276, 0.236, 0.2192, 0.2186, 0.2053, 0.1971, 0.1901, 0.1719] - assert image.shape == (1, 512, 512, 3) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_nudity_safe_stable_diffusion(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - prompt = "padme amidala taking a bath artwork, safe for work, no nudity" - seed = 2734971755 - guidance_scale = 7 - generator = paddle.Generator().manual_seed(seed=seed) - output = sd_pipe( - 
[prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=0, - ) - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [0.3502, 0.3622, 0.3396, 0.3642, 0.3478, 0.3318, 0.35, 0.3348, 0.3297] - assert image.shape == (1, 512, 512, 3) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - generator = paddle.Generator().manual_seed(seed=seed) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=2000, - sld_warmup_steps=7, - sld_threshold=0.025, - sld_momentum_scale=0.5, - sld_mom_beta=0.7, - ) - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [0.5531, 0.5206, 0.4895, 0.5156, 0.5182, 0.4751, 0.4802, 0.4803, 0.4443] - assert image.shape == (1, 512, 512, 3) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_nudity_safetychecker_safe_stable_diffusion(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.set_progress_bar_config(disable=None) - prompt = "the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c. leyendecker" - seed = 1044355234 - guidance_scale = 12 - generator = paddle.Generator().manual_seed(seed=seed) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=0, - ) - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) - assert image.shape == (1, 512, 512, 3) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-07 - generator = paddle.Generator().manual_seed(seed=seed) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=2000, - sld_warmup_steps=7, - sld_threshold=0.025, - sld_momentum_scale=0.5, - sld_mom_beta=0.7, - ) - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([0.5818, 0.6285, 0.6835, 0.6019, 0.625, 0.6754, 0.6096, 0.6334, 0.6561]) - assert image.shape == (1, 512, 512, 3) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/stable_unclip/__init__.py b/ppdiffusers/tests/pipelines/stable_unclip/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/stable_unclip/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip.py b/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip.py deleted file mode 100644 index 67465fc9c6c6..000000000000 --- a/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip.py +++ /dev/null @@ -1,186 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import paddle -from ..pipeline_params import ( - TEXT_TO_IMAGE_BATCH_PARAMS, - TEXT_TO_IMAGE_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin - -from paddlenlp.transformers import ( - CLIPTextConfig, - CLIPTextModel, - CLIPTextModelWithProjection, - CLIPTokenizer, -) -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - DDPMScheduler, - PriorTransformer, - StableUnCLIPPipeline, - UNet2DConditionModel, -) -from ppdiffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import ( - StableUnCLIPImageNormalizer, -) - - -class StableUnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableUnCLIPPipeline - test_xformers_attention = False - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - embedder_hidden_size = 32 - embedder_projection_dim = embedder_hidden_size - paddle.seed(0) - prior_tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - paddle.seed(0) - prior_text_encoder = CLIPTextModelWithProjection( - CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=embedder_hidden_size, - projection_dim=embedder_projection_dim, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - ) - paddle.seed(0) - prior = PriorTransformer( - num_attention_heads=2, attention_head_dim=12, embedding_dim=embedder_projection_dim, num_layers=1 - ) - paddle.seed(0) - prior_scheduler = DDPMScheduler( - variance_type="fixed_small_log", - prediction_type="sample", - num_train_timesteps=1000, - clip_sample=True, - clip_sample_range=5.0, - beta_schedule="squaredcos_cap_v2", - ) - paddle.seed(0) - image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedder_hidden_size) - image_noising_scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2") - paddle.seed(0) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - paddle.seed(0) - text_encoder = CLIPTextModel( - CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=embedder_hidden_size, - projection_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - ) - paddle.seed(0) - unet = UNet2DConditionModel( - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"), - block_out_channels=(32, 
64), - attention_head_dim=(2, 4), - class_embed_type="projection", - projection_class_embeddings_input_dim=embedder_projection_dim * 2, - cross_attention_dim=embedder_hidden_size, - layers_per_block=1, - upcast_attention=True, - use_linear_projection=True, - ) - paddle.seed(0) - scheduler = DDIMScheduler( - beta_schedule="scaled_linear", - beta_start=0.00085, - beta_end=0.012, - prediction_type="v_prediction", - set_alpha_to_one=False, - steps_offset=1, - ) - paddle.seed(0) - vae = AutoencoderKL() - components = { - "prior_tokenizer": prior_tokenizer, - "prior_text_encoder": prior_text_encoder, - "prior": prior, - "prior_scheduler": prior_scheduler, - "image_normalizer": image_normalizer, - "image_noising_scheduler": image_noising_scheduler, - "tokenizer": tokenizer, - "text_encoder": text_encoder, - "unet": unet, - "scheduler": scheduler, - "vae": vae, - } - return components - - def get_dummy_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "prior_num_inference_steps": 2, - "output_type": "numpy", - } - return inputs - - def test_attention_slicing_forward_pass(self): - test_max_difference = False - self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference) - - def test_inference_batch_single_identical(self): - test_max_difference = False - self._test_inference_batch_single_identical(test_max_difference=test_max_difference) - - -# @slow -# @require_paddle_gpu -# class StableUnCLIPPipelineIntegrationTests(unittest.TestCase): - -# def tearDown(self): -# super().tearDown() -# gc.collect() -# paddle.device.cuda.empty_cache() - -# def test_stable_unclip(self): -# expected_image = load_numpy( -# 'https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/stable_unclip_2_1_l_anime_turtle_fp16.npy' -# ) -# pipe = StableUnCLIPPipeline.from_pretrained( -# 'fusing/stable-unclip-2-1-l') -# pipe.set_progress_bar_config(disable=None) -# generator = paddle.Generator().manual_seed(0) -# output = pipe('anime turle', generator=generator, output_type='np') -# image = output.images[0] -# assert image.shape == (768, 768, 3) -# assert_mean_pixel_difference(image, expected_image) diff --git a/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py deleted file mode 100644 index cde3b6f58d04..000000000000 --- a/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py +++ /dev/null @@ -1,218 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import random -import unittest -import numpy as np - -import paddle -from ..pipeline_params import ( - TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS, -) -from ..test_pipelines_common import PipelineTesterMixin - -from paddlenlp.transformers import ( - CLIPImageProcessor, - CLIPTextConfig, - CLIPTextModel, - CLIPTokenizer, - CLIPVisionConfig, - CLIPVisionModelWithProjection, -) -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - DDPMScheduler, - StableUnCLIPImg2ImgPipeline, - UNet2DConditionModel, -) -from ppdiffusers.pipelines.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import ( - StableUnCLIPImageNormalizer, -) -from ppdiffusers.utils.import_utils import is_ppxformers_available -from ppdiffusers.utils.testing_utils import floats_tensor - - -class StableUnCLIPImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableUnCLIPImg2ImgPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - - def get_dummy_components(self): - embedder_hidden_size = 32 - embedder_projection_dim = embedder_hidden_size - feature_extractor = CLIPImageProcessor(crop_size=32, size=32) - image_encoder = CLIPVisionModelWithProjection( - CLIPVisionConfig( - hidden_size=embedder_hidden_size, - projection_dim=embedder_projection_dim, - num_hidden_layers=5, - num_attention_heads=4, - image_size=32, - intermediate_size=37, - patch_size=1, - ) - ) - paddle.seed(0) - image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedder_hidden_size) - image_noising_scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2") - paddle.seed(0) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - paddle.seed(0) - text_encoder = CLIPTextModel( - CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=embedder_hidden_size, - projection_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - ) - paddle.seed(0) - unet = UNet2DConditionModel( - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"), - block_out_channels=(32, 64), - attention_head_dim=(2, 4), - class_embed_type="projection", - projection_class_embeddings_input_dim=embedder_projection_dim * 2, - cross_attention_dim=embedder_hidden_size, - layers_per_block=1, - upcast_attention=True, - use_linear_projection=True, - ) - paddle.seed(0) - scheduler = DDIMScheduler( - beta_schedule="scaled_linear", - beta_start=0.00085, - beta_end=0.012, - prediction_type="v_prediction", - set_alpha_to_one=False, - steps_offset=1, - ) - paddle.seed(0) - vae = AutoencoderKL() - components = { - "feature_extractor": feature_extractor, - "image_encoder": image_encoder.eval(), - "image_normalizer": image_normalizer.eval(), - "image_noising_scheduler": image_noising_scheduler, - "tokenizer": tokenizer, - "text_encoder": text_encoder.eval(), - "unet": unet.eval(), - "scheduler": scheduler, - "vae": vae.eval(), - } - return components - - def test_image_embeds_none(self): - components = self.get_dummy_components() - sd_pipe = StableUnCLIPImg2ImgPipeline(**components) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - inputs.update({"image_embeds": None}) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, 
-3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.40317363, 1. , 0.5802471, 0.47334313, 0.39546987, 0.72409034, 0.15691131, 0.42981434, 0.72585064]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def get_dummy_inputs(self, seed=0, pil_image=True): - generator = paddle.Generator().manual_seed(seed) - - input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - if pil_image: - input_image = input_image * 0.5 + 0.5 - input_image = input_image.clip(min=0, max=1) - input_image = input_image.cpu().transpose(perm=[0, 2, 3, 1]).cast("float32").numpy() - input_image = DiffusionPipeline.numpy_to_pil(input_image)[0] - return { - "prompt": "An anime racoon running a marathon", - "image": input_image, - "generator": generator, - "num_inference_steps": 2, - "output_type": "np", - } - - def test_attention_slicing_forward_pass(self): - test_max_difference = False - self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference) - - def test_inference_batch_single_identical(self): - test_max_difference = False - self._test_inference_batch_single_identical(test_max_difference=test_max_difference) - - @unittest.skipIf( - not is_ppxformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(test_max_difference=False) - - -# @slow -# @require_paddle_gpu -# class StableUnCLIPImg2ImgPipelineIntegrationTests(unittest.TestCase): - -# def tearDown(self): -# super().tearDown() -# gc.collect() -# paddle.device.cuda.empty_cache() - -# def test_stable_unclip_l_img2img(self): -# input_image = load_image( -# 'https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/turtle.png' -# ) -# expected_image = load_numpy( -# 'https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/stable_unclip_2_1_l_img2img_anime_turtle_fp16.npy' -# ) -# pipe = StableUnCLIPImg2ImgPipeline.from_pretrained( -# 'fusing/stable-unclip-2-1-l-img2img') -# pipe.set_progress_bar_config(disable=None) -# generator = paddle.Generator().manual_seed(0) -# output = pipe(input_image, "anime turle", generator=generator, output_type="np") -# image = output.images[0] -# # breakpoint() -# assert image.shape == (768, 768, 3) -# assert_mean_pixel_difference(image, expected_image) - -# def test_stable_unclip_h_img2img(self): -# input_image = load_image( -# 'https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/turtle.png' -# ) -# expected_image = load_numpy( -# 'https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/stable_unclip_2_1_h_img2img_anime_turtle_fp16.npy' -# ) -# pipe = StableUnCLIPImg2ImgPipeline.from_pretrained( -# 'fusing/stable-unclip-2-1-h-img2img') -# pipe.set_progress_bar_config(disable=None) -# generator = paddle.Generator().manual_seed(0) -# output = pipe(input_image, "anime turle", generator=generator, output_type="np") -# image = output.images[0] -# assert image.shape == (768, 768, 3) -# assert_mean_pixel_difference(image, expected_image) diff --git a/ppdiffusers/tests/pipelines/test_pipeline_utils.py b/ppdiffusers/tests/pipelines/test_pipeline_utils.py deleted file mode 100644 index a024c27c0dc0..000000000000 --- a/ppdiffusers/tests/pipelines/test_pipeline_utils.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. 
All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -from ppdiffusers.pipelines.pipeline_utils import is_safetensors_compatible - - -class IsSafetensorsCompatibleTests(unittest.TestCase): - def test_all_is_compatible(self): - filenames = [ - "safety_checker/pytorch_model.bin", - "safety_checker/model.safetensors", - "vae/diffusion_pytorch_model.bin", - "vae/diffusion_pytorch_model.safetensors", - "text_encoder/pytorch_model.bin", - "text_encoder/model.safetensors", - "unet/diffusion_pytorch_model.bin", - "unet/diffusion_pytorch_model.safetensors", - ] - self.assertTrue(is_safetensors_compatible(filenames)) - - def test_diffusers_model_is_compatible(self): - filenames = [ - "unet/diffusion_pytorch_model.bin", - "unet/diffusion_pytorch_model.safetensors", - ] - self.assertTrue(is_safetensors_compatible(filenames)) - - def test_diffusers_model_is_not_compatible(self): - filenames = [ - "safety_checker/pytorch_model.bin", - "safety_checker/model.safetensors", - "vae/diffusion_pytorch_model.bin", - "vae/diffusion_pytorch_model.safetensors", - "text_encoder/pytorch_model.bin", - "text_encoder/model.safetensors", - "unet/diffusion_pytorch_model.bin", - # Removed: 'unet/diffusion_pytorch_model.safetensors', - ] - self.assertFalse(is_safetensors_compatible(filenames)) - - def test_transformer_model_is_compatible(self): - filenames = [ - "text_encoder/pytorch_model.bin", - "text_encoder/model.safetensors", - ] - self.assertTrue(is_safetensors_compatible(filenames)) - - def test_transformer_model_is_not_compatible(self): - filenames = [ - "safety_checker/pytorch_model.bin", - "safety_checker/model.safetensors", - "vae/diffusion_pytorch_model.bin", - "vae/diffusion_pytorch_model.safetensors", - "text_encoder/pytorch_model.bin", - # Removed: 'text_encoder/model.safetensors', - "unet/diffusion_pytorch_model.bin", - "unet/diffusion_pytorch_model.safetensors", - ] - self.assertFalse(is_safetensors_compatible(filenames)) - - def test_all_is_compatible_variant(self): - filenames = [ - "safety_checker/pytorch_model.fp16.bin", - "safety_checker/model.fp16.safetensors", - "vae/diffusion_pytorch_model.fp16.bin", - "vae/diffusion_pytorch_model.fp16.safetensors", - "text_encoder/pytorch_model.fp16.bin", - "text_encoder/model.fp16.safetensors", - "unet/diffusion_pytorch_model.fp16.bin", - "unet/diffusion_pytorch_model.fp16.safetensors", - ] - variant = "fp16" - self.assertTrue(is_safetensors_compatible(filenames, variant=variant)) - - def test_diffusers_model_is_compatible_variant(self): - filenames = [ - "unet/diffusion_pytorch_model.fp16.bin", - "unet/diffusion_pytorch_model.fp16.safetensors", - ] - variant = "fp16" - self.assertTrue(is_safetensors_compatible(filenames, variant=variant)) - - def test_diffusers_model_is_compatible_variant_partial(self): - # pass variant but use the non-variant filenames - filenames = [ - "unet/diffusion_pytorch_model.bin", - "unet/diffusion_pytorch_model.safetensors", - ] - variant = 
"fp16" - self.assertTrue(is_safetensors_compatible(filenames, variant=variant)) - - def test_diffusers_model_is_not_compatible_variant(self): - filenames = [ - "safety_checker/pytorch_model.fp16.bin", - "safety_checker/model.fp16.safetensors", - "vae/diffusion_pytorch_model.fp16.bin", - "vae/diffusion_pytorch_model.fp16.safetensors", - "text_encoder/pytorch_model.fp16.bin", - "text_encoder/model.fp16.safetensors", - "unet/diffusion_pytorch_model.fp16.bin", - # Removed: 'unet/diffusion_pytorch_model.fp16.safetensors', - ] - variant = "fp16" - self.assertFalse(is_safetensors_compatible(filenames, variant=variant)) - - def test_transformer_model_is_compatible_variant(self): - filenames = [ - "text_encoder/pytorch_model.fp16.bin", - "text_encoder/model.fp16.safetensors", - ] - variant = "fp16" - self.assertTrue(is_safetensors_compatible(filenames, variant=variant)) - - def test_transformer_model_is_compatible_variant_partial(self): - # pass variant but use the non-variant filenames - filenames = [ - "text_encoder/pytorch_model.bin", - "text_encoder/model.safetensors", - ] - variant = "fp16" - self.assertTrue(is_safetensors_compatible(filenames, variant=variant)) - - def test_transformer_model_is_not_compatible_variant(self): - filenames = [ - "safety_checker/pytorch_model.fp16.bin", - "safety_checker/model.fp16.safetensors", - "vae/diffusion_pytorch_model.fp16.bin", - "vae/diffusion_pytorch_model.fp16.safetensors", - "text_encoder/pytorch_model.fp16.bin", - # 'text_encoder/model.fp16.safetensors', - "unet/diffusion_pytorch_model.fp16.bin", - "unet/diffusion_pytorch_model.fp16.safetensors", - ] - variant = "fp16" - self.assertFalse(is_safetensors_compatible(filenames, variant=variant)) diff --git a/ppdiffusers/tests/pipelines/test_pipelines.py b/ppdiffusers/tests/pipelines/test_pipelines.py deleted file mode 100644 index aaaa3839f0d7..000000000000 --- a/ppdiffusers/tests/pipelines/test_pipelines.py +++ /dev/null @@ -1,1158 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import json -import os -import random -import shutil -import sys -import tempfile -import unittest -import unittest.mock as mock - -import numpy as np -import paddle -import PIL -import requests_mock -import safetensors.torch -from parameterized import parameterized -from PIL import Image -from requests.exceptions import HTTPError - -from paddlenlp.transformers import ( - CLIPImageProcessor, - CLIPModel, - CLIPTextConfig, - CLIPTextModel, - CLIPTokenizer, -) -from ppdiffusers import ( - AutoencoderKL, - DDIMPipeline, - DDIMScheduler, - DDPMPipeline, - DDPMScheduler, - DiffusionPipeline, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionImg2ImgPipeline, - StableDiffusionInpaintPipelineLegacy, - StableDiffusionPipeline, - UNet2DConditionModel, - UNet2DModel, - logging, -) -from ppdiffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME -from ppdiffusers.utils import ( - CONFIG_NAME, - TORCH_WEIGHTS_NAME, - floats_tensor, - nightly, - slow, -) -from ppdiffusers.utils.testing_utils import ( - CaptureLogger, - get_tests_dir, - require_compel, - require_paddle_gpu, - require_torch, -) - - -class DownloadTests(unittest.TestCase): - def test_one_request_upon_cached(self): - with tempfile.TemporaryDirectory() as tmpdirname: - with requests_mock.mock(real_http=True) as m: - DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe", - cache_dir=tmpdirname, - from_hf_hub=True, - from_diffusers=True, - ) - - download_requests = [r.method for r in m.request_history] - assert download_requests.count("HEAD") == 15, "15 calls to files" - assert download_requests.count("GET") == 17, "15 calls to files + model_info + model_index.json" - assert ( - len(download_requests) == 32 - ), "2 calls per file (15 files) + send_telemetry, model_info and model_index.json" - - with requests_mock.mock(real_http=True) as m: - DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe", - safety_checker=None, - cache_dir=tmpdirname, - from_hf_hub=True, - from_diffusers=True, - ) - - cache_requests = [r.method for r in m.request_history] - assert cache_requests.count("HEAD") == 1, "model_index.json is only HEAD" - assert cache_requests.count("GET") == 1, "model info is only GET" - assert ( - len(cache_requests) == 2 - ), "We should call only `model_info` to check for _commit hash and `send_telemetry`" - - def test_less_downloads_passed_object(self): - with tempfile.TemporaryDirectory() as tmpdirname: - cached_folder = DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe", - safety_checker=None, - cache_dir=tmpdirname, - from_hf_hub=True, - from_diffusers=True, - ) - - # make sure safety checker is not downloaded - assert "safety_checker" not in os.listdir(cached_folder) - - # make sure rest is downloaded - assert "unet" in os.listdir(cached_folder) - assert "tokenizer" in os.listdir(cached_folder) - assert "vae" in os.listdir(cached_folder) - assert "model_index.json" in os.listdir(cached_folder) - assert "scheduler" in os.listdir(cached_folder) - assert "feature_extractor" in os.listdir(cached_folder) - - def test_less_downloads_passed_object_calls(self): - - with tempfile.TemporaryDirectory() as tmpdirname: - with requests_mock.mock(real_http=True) as m: - DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe", - safety_checker=None, - cache_dir=tmpdirname, - from_hf_hub=True, - from_diffusers=True, - ) - - 
download_requests = [r.method for r in m.request_history] - # 15 - 2 because no call to config or model file for `safety_checker` - assert download_requests.count("HEAD") == 13, "13 calls to files" - # 17 - 2 because no call to config or model file for `safety_checker` - assert download_requests.count("GET") == 15, "13 calls to files + model_info + model_index.json" - assert ( - len(download_requests) == 28 - ), "2 calls per file (13 files) + send_telemetry, model_info and model_index.json" - - with requests_mock.mock(real_http=True) as m: - DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe", - safety_checker=None, - cache_dir=tmpdirname, - from_hf_hub=True, - from_diffusers=True, - ) - - cache_requests = [r.method for r in m.request_history] - assert cache_requests.count("HEAD") == 1, "model_index.json is only HEAD" - assert cache_requests.count("GET") == 1, "model info is only GET" - assert ( - len(cache_requests) == 2 - ), "We should call only `model_info` to check for _commit hash and `send_telemetry`" - - def test_download_only_pytorch(self): - with tempfile.TemporaryDirectory() as tmpdirname: - tmpdirname = DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe", - safety_checker=None, - cache_dir=tmpdirname, - from_hf_hub=True, - from_diffusers=True, - ) - - all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))] - all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname, os.listdir(tmpdirname)[0], "snapshots"))] - files = [item for sublist in all_root_files for item in sublist] - assert not any(f.endswith(".msgpack") for f in files) - assert not any(f.endswith(".safetensors") for f in files) - - def test_returned_cached_folder(self): - prompt = "hello" - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None - ) - _, local_path = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None, return_cached_folder=True - ) - pipe_2 = StableDiffusionPipeline.from_pretrained(local_path) - generator = paddle.Generator().manual_seed(0) - out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - generator = paddle.Generator().manual_seed(0) - out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - assert np.max(np.abs(out - out_2)) < 0.001 - - def test_force_safetensors_error(self): - with tempfile.TemporaryDirectory() as tmpdirname: - # pipeline has Flax weights - with self.assertRaises(EnvironmentError): - tmpdirname = DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe-no-safetensors", - from_hf_hub=True, - from_diffusers=True, - safety_checker=None, - cache_dir=tmpdirname, - use_safetensors=True, - ) - - def test_download_safetensors(self): - with tempfile.TemporaryDirectory() as tmpdirname: - tmpdirname = DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe-safetensors", - from_hf_hub=True, - from_diffusers=True, - safety_checker=None, - cache_dir=tmpdirname, - use_safetensors=True, - ) - - all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))] - files = [item for sublist in all_root_files for item in sublist] - assert not any(f.endswith(".bin") for f in files) - - def test_download_safetensors_index(self): - for variant in ["fp16", None]: - with tempfile.TemporaryDirectory() as tmpdirname: - tmpdirname = DiffusionPipeline.download( - 
"hf-internal-testing/tiny-stable-diffusion-pipe-indexes", - cache_dir=tmpdirname, - use_safetensors=True, - variant=variant, - from_hf_hub=True, - from_diffusers=True, - ) - - all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))] - files = [item for sublist in all_root_files for item in sublist] - - # None of the downloaded files should be a safetensors file even if we have some here: - # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe-indexes/tree/main/text_encoder - if variant is None: - assert not any("fp16" in f for f in files) - else: - model_files = [f for f in files if "safetensors" in f] - assert all("fp16" in f for f in model_files) - - assert len([f for f in files if ".safetensors" in f]) == 8 - assert not any(".bin" in f for f in files) - - def test_download_bin_index(self): - for variant in ["fp16", None]: - with tempfile.TemporaryDirectory() as tmpdirname: - tmpdirname = DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe-indexes", - cache_dir=tmpdirname, - use_safetensors=False, - variant=variant, - from_hf_hub=True, - from_diffusers=True, - ) - - all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))] - files = [item for sublist in all_root_files for item in sublist] - - # None of the downloaded files should be a safetensors file even if we have some here: - # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe-indexes/tree/main/text_encoder - if variant is None: - assert not any("fp16" in f for f in files) - else: - model_files = [f for f in files if "bin" in f] - assert all("fp16" in f for f in model_files) - - assert len([f for f in files if ".bin" in f]) == 8 - assert not any(".safetensors" in f for f in files) - - def test_download_no_safety_checker(self): - prompt = "hello" - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None - ) - generator = paddle.Generator().manual_seed(0) - out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - pipe_2 = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch") - generator = paddle.Generator().manual_seed(0) - out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - assert np.max(np.abs(out - out_2)) < 0.001 - - def test_load_no_safety_checker_explicit_locally(self): - prompt = "hello" - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None - ) - generator = paddle.Generator().manual_seed(0) - out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe_2 = StableDiffusionPipeline.from_pretrained(tmpdirname, safety_checker=None) - generator = paddle.Generator().manual_seed(0) - out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - assert np.max(np.abs(out - out_2)) < 0.001 - - def test_load_no_safety_checker_default_locally(self): - prompt = "hello" - pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch") - generator = paddle.Generator().manual_seed(0) - out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe_2 = StableDiffusionPipeline.from_pretrained(tmpdirname) - 
generator = paddle.Generator().manual_seed(0) - out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - assert np.max(np.abs(out - out_2)) < 0.001 - - def test_cached_files_are_used_when_no_internet(self): - response_mock = mock.Mock() - response_mock.status_code = 500 - response_mock.headers = {} - response_mock.raise_for_status.side_effect = HTTPError - response_mock.json.return_value = {} - orig_pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None - ) - orig_comps = {k: v for k, v in orig_pipe.components.items() if hasattr(v, "parameters")} - with mock.patch("requests.request", return_value=response_mock): - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None, local_files_only=True - ) - comps = {k: v for k, v in pipe.components.items() if hasattr(v, "parameters")} - for m1, m2 in zip(orig_comps.values(), comps.values()): - for p1, p2 in zip(m1.parameters(), m2.parameters()): - if (p1 != p2).sum() > 0: - assert False, "Parameters not the same!" - - def test_download_from_variant_folder(self): - for safe_avail in [False, True]: - import ppdiffusers - - ppdiffusers.utils.import_utils._safetensors_available = safe_avail - other_format = ".bin" if safe_avail else ".safetensors" - with tempfile.TemporaryDirectory() as tmpdirname: - tmpdirname = StableDiffusionPipeline.download( - "hf-internal-testing/stable-diffusion-all-variants", cache_dir=tmpdirname - ) - all_root_files = [t[-1] for t in os.walk(tmpdirname)] - files = [item for sublist in all_root_files for item in sublist] - assert len(files) == 15, f"We should only download 15 files, not {len(files)}" - assert not any(f.endswith(other_format) for f in files) - assert not any(len(f.split(".")) == 3 for f in files) - ppdiffusers.utils.import_utils._safetensors_available = True - - def test_download_variant_all(self): - for safe_avail in [False, True]: - import ppdiffusers - - ppdiffusers.utils.import_utils._safetensors_available = safe_avail - other_format = ".bin" if safe_avail else ".safetensors" - this_format = ".safetensors" if safe_avail else ".bin" - variant = "fp16" - with tempfile.TemporaryDirectory() as tmpdirname: - StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/stable-diffusion-all-variants", cache_dir=tmpdirname, variant=variant - ) - all_root_files = [ - t[-1] for t in os.walk(os.path.join(tmpdirname, os.listdir(tmpdirname)[0], "snapshots")) - ] - files = [item for sublist in all_root_files for item in sublist] - assert len(files) == 15, f"We should only download 15 files, not {len(files)}" - assert len([f for f in files if f.endswith(f"{variant}{this_format}")]) == 4 - assert not any(f.endswith(this_format) and not f.endswith(f"{variant}{this_format}") for f in files) - assert not any(f.endswith(other_format) for f in files) - ppdiffusers.utils.import_utils._safetensors_available = True - - def test_download_variant_partly(self): - for safe_avail in [False, True]: - import ppdiffusers - - ppdiffusers.utils.import_utils._safetensors_available = safe_avail - other_format = ".bin" if safe_avail else ".safetensors" - this_format = ".safetensors" if safe_avail else ".bin" - variant = "no_ema" - with tempfile.TemporaryDirectory() as tmpdirname: - tmpdirname = StableDiffusionPipeline.download( - "hf-internal-testing/stable-diffusion-all-variants", cache_dir=tmpdirname, variant=variant - ) - all_root_files = [t[-1] for t in 
os.walk(tmpdirname)] - files = [item for sublist in all_root_files for item in sublist] - - unet_files = os.listdir(os.path.join(tmpdirname, "unet")) - assert len(files) == 15, f"We should only download 15 files, not {len(files)}" - assert f"diffusion_pytorch_model.{variant}{this_format}" in unet_files - assert len([f for f in files if f.endswith(f"{variant}{this_format}")]) == 1 - assert sum(f.endswith(this_format) and not f.endswith(f"{variant}{this_format}") for f in files) == 3 - assert not any(f.endswith(other_format) for f in files) - ppdiffusers.utils.import_utils._safetensors_available = True - - def test_download_broken_variant(self): - pass - # for safe_avail in [False, True]: - # import ppdiffusers - - # ppdiffusers.utils.import_utils._safetensors_available = safe_avail - # for variant in [None, "no_ema"]: - # with self.assertRaises(OSError) as error_context: - # with tempfile.TemporaryDirectory() as tmpdirname: - # tmpdirname = StableDiffusionPipeline.download( - # "hf-internal-testing/stable-diffusion-broken-variants", - # cache_dir=tmpdirname, - # variant=variant, - # ) - # assert "Error no file name" in str(error_context.exception) - # with tempfile.TemporaryDirectory() as tmpdirname: - # tmpdirname = StableDiffusionPipeline.download( - # "hf-internal-testing/stable-diffusion-broken-variants", cache_dir=tmpdirname, variant="fp16" - # ) - - # all_root_files = [t[-1] for t in os.walk(tmpdirname)] - # files = [item for sublist in all_root_files for item in sublist] - # assert len(files) == 15, f"We should only download 15 files, not {len(files)}" - # ppdiffusers.utils.import_utils._safetensors_available = True - - def test_local_save_load_index(self): - # TODO support index file - pass - - @require_torch - def test_text_inversion_download(self): - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None - ) - import torch - - num_tokens = len(pipe.tokenizer) - - # single token load local - with tempfile.TemporaryDirectory() as tmpdirname: - ten = {"<*>": torch.ones((32,))} - torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin")) - - pipe.load_textual_inversion(tmpdirname, from_diffusers=True) - - token = pipe.tokenizer.convert_tokens_to_ids("<*>") - assert token == num_tokens, "Added token must be at spot `num_tokens`" - assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 32 - assert pipe._maybe_convert_prompt("<*>", pipe.tokenizer) == "<*>" - - prompt = "hey <*>" - out = pipe(prompt, num_inference_steps=1, output_type="numpy").images - assert out.shape == (1, 128, 128, 3) - - # single token load local with weight name - ten = {"<**>": 2 * torch.ones((1, 32))} - torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin")) - - pipe.load_textual_inversion(tmpdirname, weight_name="learned_embeds.bin", from_diffusers=True) - - token = pipe.tokenizer.convert_tokens_to_ids("<**>") - assert token == num_tokens + 1, "Added token must be at spot `num_tokens`" - assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 64 - assert pipe._maybe_convert_prompt("<**>", pipe.tokenizer) == "<**>" - - prompt = "hey <**>" - out = pipe(prompt, num_inference_steps=1, output_type="numpy").images - assert out.shape == (1, 128, 128, 3) - - # multi token load - ten = {"<***>": torch.cat([3 * torch.ones((1, 32)), 4 * torch.ones((1, 32)), 5 * torch.ones((1, 32))])} - torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin")) - - pipe.load_textual_inversion(tmpdirname, 
from_diffusers=True) - - token = pipe.tokenizer.convert_tokens_to_ids("<***>") - token_1 = pipe.tokenizer.convert_tokens_to_ids("<***>_1") - token_2 = pipe.tokenizer.convert_tokens_to_ids("<***>_2") - - assert token == num_tokens + 2, "Added token must be at spot `num_tokens`" - assert token_1 == num_tokens + 3, "Added token must be at spot `num_tokens`" - assert token_2 == num_tokens + 4, "Added token must be at spot `num_tokens`" - assert pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() == 96 - assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 128 - assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 160 - assert pipe._maybe_convert_prompt("<***>", pipe.tokenizer) == "<***> <***>_1 <***>_2" - - prompt = "hey <***>" - out = pipe(prompt, num_inference_steps=1, output_type="numpy").images - assert out.shape == (1, 128, 128, 3) - - # multi token load a1111 - ten = { - "string_to_param": { - "*": torch.cat([3 * torch.ones((1, 32)), 4 * torch.ones((1, 32)), 5 * torch.ones((1, 32))]) - }, - "name": "<****>", - } - torch.save(ten, os.path.join(tmpdirname, "a1111.bin")) - - pipe.load_textual_inversion(tmpdirname, weight_name="a1111.bin", from_diffusers=True) - - token = pipe.tokenizer.convert_tokens_to_ids("<****>") - token_1 = pipe.tokenizer.convert_tokens_to_ids("<****>_1") - token_2 = pipe.tokenizer.convert_tokens_to_ids("<****>_2") - - assert token == num_tokens + 5, "Added token must be at spot `num_tokens`" - assert token_1 == num_tokens + 6, "Added token must be at spot `num_tokens`" - assert token_2 == num_tokens + 7, "Added token must be at spot `num_tokens`" - assert pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() == 96 - assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 128 - assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 160 - assert pipe._maybe_convert_prompt("<****>", pipe.tokenizer) == "<****> <****>_1 <****>_2" - - prompt = "hey <****>" - out = pipe(prompt, num_inference_steps=1, output_type="numpy").images - assert out.shape == (1, 128, 128, 3) - - def test_download_ignore_files(self): - # Check https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe-ignore-files/blob/72f58636e5508a218c6b3f60550dc96445547817/model_index.json#L4 - with tempfile.TemporaryDirectory() as tmpdirname: - # pipeline has Flax weights - tmpdirname = DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe-ignore-files", cache_dir=tmpdirname - ) - files = [] - for root, ds, fs in os.walk(tmpdirname): - for f in fs: - str_path = str(os.path.join(root, f)).replace(str(tmpdirname) + "/", "") - files.append(str_path) - # None of the downloaded files should be a pytorch file even if we have some here: - # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe/blob/main/unet/diffusion_flax_model.msgpack - assert not any(f in files for f in ["vae/diffusion_pytorch_model.bin", "text_encoder/config.json"]) - assert len(files) == 13 - - -class CustomPipelineTests(unittest.TestCase): - def test_load_custom_pipeline(self): - pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline="junnyu/ppdiffusers-dummy-pipeline" - ) - pipeline = pipeline - assert pipeline.__class__.__name__ == "CustomPipeline" - - def test_load_custom_github(self): - pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline="one_step_unet", custom_revision="develop" - ) - with paddle.no_grad(): - output 
= pipeline() - assert output.numel() == output.sum() - - del sys.modules["ppdiffusers_modules.git.one_step_unet"] - pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", - custom_pipeline="one_step_unet", - custom_revision="b088618584825b9a2373daecda4193ef450b72d0", - ) - with paddle.no_grad(): - output = pipeline() - assert output.numel() != output.sum() - - assert pipeline.__class__.__name__ == "UnetSchedulerOneForwardPipeline" - - def test_run_custom_pipeline(self): - pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline="junnyu/ppdiffusers-dummy-pipeline" - ) - pipeline = pipeline - images, output_str = pipeline(num_inference_steps=2, output_type="np") - assert images[0].shape == (1, 32, 32, 3) - assert output_str == "This is a test" - - def test_local_custom_pipeline_repo(self): - local_custom_pipeline_path = get_tests_dir("fixtures/custom_pipeline") - pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline=local_custom_pipeline_path - ) - pipeline = pipeline - images, output_str = pipeline(num_inference_steps=2, output_type="np") - assert pipeline.__class__.__name__ == "CustomLocalPipeline" - assert images[0].shape == (1, 32, 32, 3) - assert output_str == "This is a local test" - - def test_local_custom_pipeline_file(self): - local_custom_pipeline_path = get_tests_dir("fixtures/custom_pipeline") - local_custom_pipeline_path = os.path.join(local_custom_pipeline_path, "what_ever.py") - pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline=local_custom_pipeline_path - ) - pipeline = pipeline - images, output_str = pipeline(num_inference_steps=2, output_type="np") - assert pipeline.__class__.__name__ == "CustomLocalPipeline" - assert images[0].shape == (1, 32, 32, 3) - assert output_str == "This is a local test" - - @slow - @require_paddle_gpu - def test_download_from_git(self): - clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" - feature_extractor = CLIPImageProcessor.from_pretrained(clip_model_id, from_hf_hub=False) - clip_model = CLIPModel.from_pretrained( - clip_model_id, paddle_dtype=paddle.float16, from_hf_hub=False, from_diffusers=False - ) - pipeline = DiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - custom_pipeline="clip_guided_stable_diffusion", - clip_model=clip_model, - feature_extractor=feature_extractor, - paddle_dtype=paddle.float16, - from_hf_hub=False, - from_diffusers=False, - ) - pipeline.enable_attention_slicing() - assert pipeline.__class__.__name__ == "CLIPGuidedStableDiffusion" - image = pipeline("a prompt", num_inference_steps=2, output_type="np").images[0] - assert image.shape == (512, 512, 3) - - def test_save_pipeline_change_config(self): - pipe = DiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None - ) - - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = DiffusionPipeline.from_pretrained(tmpdirname) - - assert pipe.scheduler.__class__.__name__ == "PNDMScheduler" - - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - pipe.save_pretrained(tmpdirname) - pipe = DiffusionPipeline.from_pretrained(tmpdirname) - - assert pipe.scheduler.__class__.__name__ == "DPMSolverMultistepScheduler" - # let's make sure that changing the scheduler is correctly reflected - - -class PipelineFastTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - 
paddle.device.cuda.empty_cache() - import ppdiffusers - - ppdiffusers.utils.import_utils._safetensors_available = True - - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = 32, 32 - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) - return image - - def dummy_uncond_unet(self, sample_size=32): - paddle.seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=sample_size, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - def dummy_cond_unet(self, sample_size=32): - paddle.seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=sample_size, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_vae(self): - paddle.seed(0) - model = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return model - - @property - def dummy_text_encoder(self): - paddle.seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config).eval() - - @property - def dummy_extractor(self): - def extract(*args, **kwargs): - class Out: - def __init__(self): - self.pixel_values = paddle.ones(shape=[0]) - - def to(self, device): - self.pixel_values - return self - - return Out() - - return extract - - @parameterized.expand( - [ - [DDIMScheduler, DDIMPipeline, 32], - [DDPMScheduler, DDPMPipeline, 32], - [DDIMScheduler, DDIMPipeline, (32, 64)], - [DDPMScheduler, DDPMPipeline, (64, 32)], - ] - ) - def test_uncond_unet_components(self, scheduler_fn=DDPMScheduler, pipeline_fn=DDPMPipeline, sample_size=32): - unet = self.dummy_uncond_unet(sample_size) - scheduler = scheduler_fn() - pipeline = pipeline_fn(unet, scheduler) - generator = paddle.Generator().manual_seed(0) - out_image = pipeline(generator=generator, num_inference_steps=2, output_type="np").images - sample_size = (sample_size, sample_size) if isinstance(sample_size, int) else sample_size - assert out_image.shape == (1, *sample_size, 3) - - def test_stable_diffusion_components(self): - """Test that components property works correctly""" - unet = self.dummy_cond_unet() - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - image = self.dummy_image().cpu().transpose(perm=[0, 2, 3, 1])[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) - inpaint = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - img2img = StableDiffusionImg2ImgPipeline(**inpaint.components) - text2img = StableDiffusionPipeline(**inpaint.components) - prompt = "A painting of a squirrel eating a burger" - generator = paddle.Generator().manual_seed(0) - 
image_inpaint = inpaint( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ).images - image_img2img = img2img( - [prompt], generator=generator, num_inference_steps=2, output_type="np", image=init_image - ).images - image_text2img = text2img([prompt], generator=generator, num_inference_steps=2, output_type="np").images - assert image_inpaint.shape == (1, 32, 32, 3) - assert image_img2img.shape == (1, 32, 32, 3) - assert image_text2img.shape == (1, 64, 64, 3) - - def test_set_scheduler(self): - unet = self.dummy_cond_unet() - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - sd = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd.scheduler = DDIMScheduler.from_config(sd.scheduler.config) - assert isinstance(sd.scheduler, DDIMScheduler) - sd.scheduler = DDPMScheduler.from_config(sd.scheduler.config) - assert isinstance(sd.scheduler, DDPMScheduler) - sd.scheduler = PNDMScheduler.from_config(sd.scheduler.config) - assert isinstance(sd.scheduler, PNDMScheduler) - sd.scheduler = LMSDiscreteScheduler.from_config(sd.scheduler.config) - assert isinstance(sd.scheduler, LMSDiscreteScheduler) - sd.scheduler = EulerDiscreteScheduler.from_config(sd.scheduler.config) - assert isinstance(sd.scheduler, EulerDiscreteScheduler) - sd.scheduler = EulerAncestralDiscreteScheduler.from_config(sd.scheduler.config) - assert isinstance(sd.scheduler, EulerAncestralDiscreteScheduler) - sd.scheduler = DPMSolverMultistepScheduler.from_config(sd.scheduler.config) - assert isinstance(sd.scheduler, DPMSolverMultistepScheduler) - - def test_set_component_to_none(self): - unet = self.dummy_cond_unet() - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - pipeline = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - - generator = paddle.Generator().manual_seed(0) - - prompt = "This is a flower" - - out_image = pipeline( - prompt=prompt, - generator=generator, - num_inference_steps=1, - output_type="np", - ).images - - pipeline.feature_extractor = None - generator = paddle.Generator().manual_seed(0) - out_image_2 = pipeline( - prompt=prompt, - generator=generator, - num_inference_steps=1, - output_type="np", - ).images - - assert out_image.shape == (1, 64, 64, 3) - assert np.abs(out_image - out_image_2).max() < 1e-3 - - def test_set_scheduler_consistency(self): - unet = self.dummy_cond_unet() - pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler") - ddim = DDIMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler") - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - sd = StableDiffusionPipeline( - unet=unet, - scheduler=pndm, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - pndm_config = sd.scheduler.config - sd.scheduler = 
DDPMScheduler.from_config(pndm_config) - sd.scheduler = PNDMScheduler.from_config(sd.scheduler.config) - pndm_config_2 = sd.scheduler.config - pndm_config_2 = {k: v for k, v in pndm_config_2.items() if k in pndm_config} - assert dict(pndm_config) == dict(pndm_config_2) - sd = StableDiffusionPipeline( - unet=unet, - scheduler=ddim, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - ddim_config = sd.scheduler.config - sd.scheduler = LMSDiscreteScheduler.from_config(ddim_config) - sd.scheduler = DDIMScheduler.from_config(sd.scheduler.config) - ddim_config_2 = sd.scheduler.config - ddim_config_2 = {k: v for k, v in ddim_config_2.items() if k in ddim_config} - assert dict(ddim_config) == dict(ddim_config_2) - - def test_save_safe_serialization(self): - pipeline = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", from_hf_hub=True, from_diffusers=True - ) - with tempfile.TemporaryDirectory() as tmpdirname: - pipeline.save_pretrained(tmpdirname, safe_serialization=True, to_diffusers=True) - vae_path = os.path.join(tmpdirname, "vae", "diffusion_pytorch_model.safetensors") - assert os.path.exists(vae_path), f"Could not find {vae_path}" - _ = safetensors.torch.load_file(vae_path) - unet_path = os.path.join(tmpdirname, "unet", "diffusion_pytorch_model.safetensors") - assert os.path.exists(unet_path), f"Could not find {unet_path}" - _ = safetensors.torch.load_file(unet_path) - text_encoder_path = os.path.join(tmpdirname, "text_encoder", "model.safetensors") - assert os.path.exists(text_encoder_path), f"Could not find {text_encoder_path}" - _ = safetensors.torch.load_file(text_encoder_path) - pipeline = StableDiffusionPipeline.from_pretrained(tmpdirname, from_diffusers=True) - assert pipeline.unet is not None - assert pipeline.vae is not None - assert pipeline.text_encoder is not None - assert pipeline.scheduler is not None - assert pipeline.feature_extractor is not None - - def test_no_pytorch_download_when_doing_safetensors(self): - with tempfile.TemporaryDirectory() as tmpdirname: - _ = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/diffusers-stable-diffusion-tiny-all", cache_dir=tmpdirname - ) - path = os.path.join( - tmpdirname, - "models--hf-internal-testing--diffusers-stable-diffusion-tiny-all", - "snapshots", - "07838d72e12f9bcec1375b0482b80c1d399be843", - "unet", - ) - assert os.path.exists(os.path.join(path, "diffusion_pytorch_model.safetensors")) - assert not os.path.exists(os.path.join(path, "diffusion_pytorch_model.bin")) - - def test_no_safetensors_download_when_doing_pytorch(self): - import ppdiffusers - - ppdiffusers.utils.import_utils._safetensors_available = False - with tempfile.TemporaryDirectory() as tmpdirname: - _ = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/diffusers-stable-diffusion-tiny-all", cache_dir=tmpdirname - ) - path = os.path.join( - tmpdirname, - "models--hf-internal-testing--diffusers-stable-diffusion-tiny-all", - "snapshots", - "07838d72e12f9bcec1375b0482b80c1d399be843", - "unet", - ) - assert not os.path.exists(os.path.join(path, "diffusion_pytorch_model.safetensors")) - assert os.path.exists(os.path.join(path, "diffusion_pytorch_model.bin")) - ppdiffusers.utils.import_utils._safetensors_available = True - - def test_optional_components(self): - unet = self.dummy_cond_unet() - pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler") - vae = self.dummy_vae - bert 
= self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - sd = StableDiffusionPipeline( - unet=unet, - scheduler=pndm, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=unet, - feature_extractor=self.dummy_extractor, - ) - assert sd.config.requires_safety_checker is True - with tempfile.TemporaryDirectory() as tmpdirname: - sd.save_pretrained(tmpdirname) - sd = StableDiffusionPipeline.from_pretrained( - tmpdirname, feature_extractor=None, safety_checker=None, requires_safety_checker=False - ) - assert sd.config.requires_safety_checker is False - assert sd.config.safety_checker == (None, None) - assert sd.config.feature_extractor == (None, None) - sd.save_pretrained(tmpdirname) - sd = StableDiffusionPipeline.from_pretrained(tmpdirname) - assert sd.config.requires_safety_checker is False - assert sd.config.safety_checker == (None, None) - assert sd.config.feature_extractor == (None, None) - # sd.save_pretrained(tmpdirname) - shutil.rmtree(os.path.join(tmpdirname, "safety_checker")) - with open(os.path.join(tmpdirname, sd.config_name)) as f: - config = json.load(f) - config["safety_checker"] = [None, None] - with open(os.path.join(tmpdirname, sd.config_name), "w") as f: - json.dump(config, f) - sd = StableDiffusionPipeline.from_pretrained(tmpdirname, requires_safety_checker=False) - sd.save_pretrained(tmpdirname) - sd = StableDiffusionPipeline.from_pretrained(tmpdirname) - assert sd.config.requires_safety_checker is False - assert sd.config.safety_checker == (None, None) - assert sd.config.feature_extractor == (None, None) - with open(os.path.join(tmpdirname, sd.config_name)) as f: - config = json.load(f) - del config["safety_checker"] - del config["feature_extractor"] - with open(os.path.join(tmpdirname, sd.config_name), "w") as f: - json.dump(config, f) - sd = StableDiffusionPipeline.from_pretrained(tmpdirname) - assert sd.config.requires_safety_checker is False - assert sd.config.safety_checker == (None, None) - assert sd.config.feature_extractor == (None, None) - sd.save_pretrained(tmpdirname) - sd = StableDiffusionPipeline.from_pretrained(tmpdirname, feature_extractor=self.dummy_extractor) - assert sd.config.requires_safety_checker is False - assert sd.config.safety_checker == (None, None) - assert sd.config.feature_extractor != (None, None) - sd = StableDiffusionPipeline.from_pretrained( - tmpdirname, - feature_extractor=self.dummy_extractor, - safety_checker=unet, - requires_safety_checker=[True, True], - ) - assert sd.config.requires_safety_checker == [True, True] - assert sd.config.safety_checker != (None, None) - assert sd.config.feature_extractor != (None, None) - sd.save_pretrained(tmpdirname) - sd = StableDiffusionPipeline.from_pretrained(tmpdirname, feature_extractor=self.dummy_extractor) - assert sd.config.requires_safety_checker == [True, True] - assert sd.config.safety_checker != (None, None) - assert sd.config.feature_extractor != (None, None) - - @require_compel - def test_weighted_prompts_compel(self): - pass - - -@slow -@require_paddle_gpu -class PipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_smart_download(self): - model_id = "hf-internal-testing/unet-pipeline-dummy" - with tempfile.TemporaryDirectory() as tmpdirname: - _ = DiffusionPipeline.from_pretrained(model_id, cache_dir=tmpdirname, force_download=True) - local_repo_name = "--".join(["models"] + model_id.split("/")) - snapshot_dir = 
os.path.join(tmpdirname, local_repo_name, "snapshots") - snapshot_dir = os.path.join(snapshot_dir, os.listdir(snapshot_dir)[0]) - assert os.path.isfile(os.path.join(snapshot_dir, DiffusionPipeline.config_name)) - assert os.path.isfile(os.path.join(snapshot_dir, CONFIG_NAME)) - assert os.path.isfile(os.path.join(snapshot_dir, SCHEDULER_CONFIG_NAME)) - assert os.path.isfile(os.path.join(snapshot_dir, TORCH_WEIGHTS_NAME)) - assert os.path.isfile(os.path.join(snapshot_dir, "scheduler", SCHEDULER_CONFIG_NAME)) - assert os.path.isfile(os.path.join(snapshot_dir, "unet", TORCH_WEIGHTS_NAME)) - assert os.path.isfile(os.path.join(snapshot_dir, "unet", TORCH_WEIGHTS_NAME)) - assert not os.path.isfile(os.path.join(snapshot_dir, "big_array.npy")) - - def test_warning_unused_kwargs(self): - model_id = "hf-internal-testing/unet-pipeline-dummy" - logger = logging.get_logger("ppdiffusers.pipelines") - with tempfile.TemporaryDirectory() as tmpdirname: - with CaptureLogger(logger) as cap_logger: - DiffusionPipeline.from_pretrained(model_id, not_used=True, cache_dir=tmpdirname, force_download=True) - assert ( - cap_logger.out.strip().split("\n")[-1] - == "Keyword arguments {'not_used': True} are not expected by DDPMPipeline and will be ignored." - ) - - def test_from_save_pretrained(self): - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - scheduler = DDPMScheduler(num_train_timesteps=10) - ddpm = DDPMPipeline(model, scheduler) - ddpm.set_progress_bar_config(disable=None) - - with tempfile.TemporaryDirectory() as tmpdirname: - ddpm.save_pretrained(tmpdirname) - new_ddpm = DDPMPipeline.from_pretrained(tmpdirname) - - generator = paddle.Generator().manual_seed(0) - image = ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images - - generator = paddle.Generator().manual_seed(0) - new_image = new_ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images - - assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass" - - def test_from_pretrained_hub(self): - model_path = "google/ddpm-cifar10-32" - scheduler = DDPMScheduler(num_train_timesteps=10) - ddpm = DDPMPipeline.from_pretrained(model_path, scheduler=scheduler) - ddpm.set_progress_bar_config(disable=None) - ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler) - ddpm_from_hub = ddpm_from_hub - ddpm_from_hub.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - image = ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images - generator = paddle.Generator().manual_seed(0) - new_image = ddpm_from_hub(generator=generator, num_inference_steps=5, output_type="numpy").images - assert np.abs(image - new_image).sum() < 1e-05, "Models don't give the same forward pass" - - def test_from_pretrained_hub_pass_model(self): - model_path = "google/ddpm-cifar10-32" - scheduler = DDPMScheduler(num_train_timesteps=10) - unet = UNet2DModel.from_pretrained(model_path) - ddpm_from_hub_custom_model = DiffusionPipeline.from_pretrained(model_path, unet=unet, scheduler=scheduler) - ddpm_from_hub_custom_model = ddpm_from_hub_custom_model - ddpm_from_hub_custom_model.set_progress_bar_config(disable=None) - ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler) - ddpm_from_hub_custom_model.set_progress_bar_config(disable=None) 
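The save/reload round trip exercised by test_from_save_pretrained above, written out as a standalone sketch for readability. It reuses the same tiny UNet2DModel config and tolerance as the deleted test and is illustrative only:

    import tempfile

    import numpy as np
    import paddle

    from ppdiffusers import DDPMPipeline, DDPMScheduler, UNet2DModel

    unet = UNet2DModel(
        block_out_channels=(32, 64),
        layers_per_block=2,
        sample_size=32,
        in_channels=3,
        out_channels=3,
        down_block_types=("DownBlock2D", "AttnDownBlock2D"),
        up_block_types=("AttnUpBlock2D", "UpBlock2D"),
    )
    ddpm = DDPMPipeline(unet, DDPMScheduler(num_train_timesteps=10))
    ddpm.set_progress_bar_config(disable=None)

    with tempfile.TemporaryDirectory() as tmpdir:
        ddpm.save_pretrained(tmpdir)
        new_ddpm = DDPMPipeline.from_pretrained(tmpdir)

    # With identical seeds, the original and the reloaded pipeline should produce
    # numerically identical samples.
    image = ddpm(generator=paddle.Generator().manual_seed(0),
                 num_inference_steps=5, output_type="numpy").images
    new_image = new_ddpm(generator=paddle.Generator().manual_seed(0),
                         num_inference_steps=5, output_type="numpy").images
    assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass"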
- generator = paddle.Generator().manual_seed(0) - image = ddpm_from_hub_custom_model(generator=generator, num_inference_steps=5, output_type="numpy").images - generator = paddle.Generator().manual_seed(0) - new_image = ddpm_from_hub(generator=generator, num_inference_steps=5, output_type="numpy").images - assert np.abs(image - new_image).sum() < 1e-05, "Models don't give the same forward pass" - - def test_output_format(self): - model_path = "google/ddpm-cifar10-32" - scheduler = DDIMScheduler.from_pretrained(model_path) - pipe = DDIMPipeline.from_pretrained(model_path, scheduler=scheduler) - pipe.set_progress_bar_config(disable=None) - images = pipe(output_type="numpy").images - assert images.shape == (1, 32, 32, 3) - assert isinstance(images, np.ndarray) - images = pipe(output_type="pil", num_inference_steps=4).images - assert isinstance(images, list) - assert len(images) == 1 - assert isinstance(images[0], PIL.Image.Image) - images = pipe(num_inference_steps=4).images - assert isinstance(images, list) - assert isinstance(images[0], PIL.Image.Image) - - -@nightly -@require_paddle_gpu -class PipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_ddpm_ddim_equality_batched(self): - seed = 0 - model_id = "google/ddpm-cifar10-32" - unet = UNet2DModel.from_pretrained(model_id) - ddpm_scheduler = DDPMScheduler() - ddim_scheduler = DDIMScheduler() - ddpm = DDPMPipeline(unet=unet, scheduler=ddpm_scheduler) - ddpm.set_progress_bar_config(disable=None) - ddim = DDIMPipeline(unet=unet, scheduler=ddim_scheduler) - ddim.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(seed) - ddpm_images = ddpm(batch_size=2, generator=generator, output_type="numpy").images - generator = paddle.Generator().manual_seed(seed) - ddim_images = ddim( - batch_size=2, - generator=generator, - num_inference_steps=1000, - eta=1.0, - output_type="numpy", - use_clipped_model_output=True, - ).images - assert np.abs(ddpm_images - ddim_images).max() < 0.1 diff --git a/ppdiffusers/tests/pipelines/test_pipelines_common.py b/ppdiffusers/tests/pipelines/test_pipelines_common.py deleted file mode 100644 index fef981647ffd..000000000000 --- a/ppdiffusers/tests/pipelines/test_pipelines_common.py +++ /dev/null @@ -1,488 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
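The nightly DDPM/DDIM batched-equality check above relies on DDIM with eta=1.0 and the full 1000 steps closely tracking ancestral DDPM sampling for the same seed. A condensed sketch of that check, assuming the google/ddpm-cifar10-32 weights are downloadable; the tolerance follows the deleted test:

    import numpy as np
    import paddle

    from ppdiffusers import (
        DDIMPipeline,
        DDIMScheduler,
        DDPMPipeline,
        DDPMScheduler,
        UNet2DModel,
    )

    unet = UNet2DModel.from_pretrained("google/ddpm-cifar10-32")
    ddpm = DDPMPipeline(unet=unet, scheduler=DDPMScheduler())
    ddim = DDIMPipeline(unet=unet, scheduler=DDIMScheduler())

    generator = paddle.Generator().manual_seed(0)
    ddpm_images = ddpm(batch_size=2, generator=generator, output_type="numpy").images

    generator = paddle.Generator().manual_seed(0)
    ddim_images = ddim(
        batch_size=2,
        generator=generator,
        num_inference_steps=1000,
        eta=1.0,                        # eta=1.0 makes DDIM sampling stochastic, like DDPM
        use_clipped_model_output=True,  # match DDPM's clipping of the predicted sample
        output_type="numpy",
    ).images

    assert np.abs(ddpm_images - ddim_images).max() < 0.1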
- -import contextlib -import gc -import inspect -import io -import re -import tempfile -from typing import Callable, Union - -import numpy as np -import paddle - -import ppdiffusers -from ppdiffusers import DiffusionPipeline -from ppdiffusers.utils import logging -from ppdiffusers.utils.testing_utils import require_paddle - - -def to_np(tensor): - if isinstance(tensor, paddle.Tensor): - tensor = tensor.detach().cpu().numpy() - - return tensor - - -@require_paddle -class PipelineTesterMixin: - """ - This mixin is designed to be used with unittest.TestCase classes. - It provides a set of common tests for each PyTorch pipeline, e.g. saving and loading the pipeline, - equivalence of dict and tuple outputs, etc. - """ - - # Canonical parameters that are passed to `__call__` regardless - # of the type of pipeline. They are always optional and have common - # sense default values. - required_optional_params = frozenset( - [ - "num_inference_steps", - "num_images_per_prompt", - "generator", - "latents", - "output_type", - "return_dict", - "callback", - "callback_steps", - ] - ) - num_inference_steps_args = ["num_inference_steps"] - test_attention_slicing = True - test_cpu_offload = False - test_xformers_attention = True - - def get_generator(self, seed): - generator = paddle.Generator().manual_seed(seed) - return generator - - @property - def pipeline_class(self) -> Union[Callable, DiffusionPipeline]: - raise NotImplementedError( - "You need to set the attribute `pipeline_class = ClassNameOfPipeline` in the child test class. See existing pipeline tests for reference." - ) - - def get_dummy_components(self): - raise NotImplementedError( - "You need to implement `get_dummy_components(self)` in the child test class. See existing pipeline tests for reference." - ) - - def get_dummy_inputs(self, seed=0): - raise NotImplementedError( - "You need to implement `get_dummy_inputs(self, seed)` in the child test class. See existing pipeline tests for reference." - ) - - @property - def params(self) -> frozenset: - raise NotImplementedError( - "You need to set the attribute `params` in the child test class. " - "`params` are checked for if all values are present in `__call__`'s signature." - " You can set `params` using one of the common set of parameters defined in `pipeline_params.py`" - " e.g., `TEXT_TO_IMAGE_PARAMS` defines the common parameters used in text to " - "image pipelines, including prompts and prompt embedding overrides." - "If your pipeline's set of arguments has minor changes from one of the common sets of arguments, " - "do not make modifications to the existing common sets of arguments. I.e. a text to image pipeline " - "with non-configurable height and width arguments should set the attribute as " - "`params = TEXT_TO_IMAGE_PARAMS - {'height', 'width'}`. " - "See existing pipeline tests for reference." - ) - - @property - def batch_params(self) -> frozenset: - raise NotImplementedError( - "You need to set the attribute `batch_params` in the child test class. " - "`batch_params` are the parameters required to be batched when passed to the pipeline's " - "`__call__` method. `pipeline_params.py` provides some common sets of parameters such as " - "`TEXT_TO_IMAGE_BATCH_PARAMS`, `IMAGE_VARIATION_BATCH_PARAMS`, etc... If your pipeline's " - "set of batch arguments has minor changes from one of the common sets of batch arguments, " - "do not make modifications to the existing common sets of batch arguments. I.e. 
a text to " - "image pipeline `negative_prompt` is not batched should set the attribute as " - "`batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - {'negative_prompt'}`. " - "See existing pipeline tests for reference." - ) - - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_save_load_local(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - output = pipe(**inputs)[0] - with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir, to_diffusers=False) - pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, from_diffusers=False) - pipe_loaded.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - output_loaded = pipe_loaded(**inputs)[0] - max_diff = np.abs(to_np(output) - to_np(output_loaded)).max() - self.assertLess(max_diff, 0.002) - - def test_pipeline_call_signature(self): - self.assertTrue( - hasattr(self.pipeline_class, "__call__"), f"{self.pipeline_class} should have a `__call__` method" - ) - - parameters = inspect.signature(self.pipeline_class.__call__).parameters - - optional_parameters = set() - - for k, v in parameters.items(): - if v.default != inspect._empty: - optional_parameters.add(k) - - parameters = set(parameters.keys()) - parameters.remove("self") - parameters.discard("kwargs") # kwargs can be added if arguments of pipeline call function are deprecated - - remaining_required_parameters = set() - - for param in self.params: - if param not in parameters: - remaining_required_parameters.add(param) - - self.assertTrue( - len(remaining_required_parameters) == 0, - f"Required parameters not present: {remaining_required_parameters}", - ) - - remaining_required_optional_parameters = set() - - for param in self.required_optional_params: - if param not in optional_parameters: - remaining_required_optional_parameters.add(param) - - self.assertTrue( - len(remaining_required_optional_parameters) == 0, - f"Required optional parameters not present: {remaining_required_optional_parameters}", - ) - - def test_inference_batch_consistent(self, batch_sizes=[2, 4, 13]): - self._test_inference_batch_consistent(batch_sizes=batch_sizes) - - def _test_inference_batch_consistent( - self, batch_sizes=[2, 4, 13], additional_params_copy_to_batched_inputs=["num_inference_steps"] - ): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - logger = logging.get_logger(pipe.__module__) - logger.setLevel(level=ppdiffusers.logging.FATAL) - for batch_size in batch_sizes: - batched_inputs = {} - for name, value in inputs.items(): - if name in self.batch_params: - if name == "prompt": - len_prompt = len(value) - batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)] - batched_inputs[name][-1] = 2000 * "very long" - else: - batched_inputs[name] = batch_size * [value] - elif name == "batch_size": - batched_inputs[name] = batch_size - else: - batched_inputs[name] = value - for arg in additional_params_copy_to_batched_inputs: - batched_inputs[arg] = inputs[arg] - batched_inputs["output_type"] = None - if self.pipeline_class.__name__ == "DanceDiffusionPipeline": - batched_inputs.pop("output_type") - output = pipe(**batched_inputs) - assert len(output[0]) == batch_size - batched_inputs["output_type"] = "np" - if self.pipeline_class.__name__ == "DanceDiffusionPipeline": 
- batched_inputs.pop("output_type") - output = pipe(**batched_inputs)[0] - assert output.shape[0] == batch_size - logger.setLevel(level=ppdiffusers.logging.WARNING) - - def test_inference_batch_single_identical(self, batch_size=3): - self._test_inference_batch_single_identical(batch_size=batch_size) - - def _test_inference_batch_single_identical( - self, - batch_size=3, - test_max_difference=None, - test_mean_pixel_difference=None, - relax_max_difference=False, - expected_max_diff=1e-4, - additional_params_copy_to_batched_inputs=["num_inference_steps"], - ): - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - logger = logging.get_logger(pipe.__module__) - logger.setLevel(level=ppdiffusers.logging.FATAL) - - batched_inputs = {} - for name, value in inputs.items(): - if name in self.batch_params: - if name == "prompt": - len_prompt = len(value) - batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)] - batched_inputs[name][-1] = 2000 * "very long" - else: - batched_inputs[name] = batch_size * [value] - elif name == "batch_size": - batched_inputs[name] = batch_size - elif name == "generator": - batched_inputs[name] = [self.get_generator(i) for i in range(batch_size)] - else: - batched_inputs[name] = value - - for arg in additional_params_copy_to_batched_inputs: - batched_inputs[arg] = inputs[arg] - if self.pipeline_class.__name__ != "DanceDiffusionPipeline": - batched_inputs["output_type"] = "np" - output_batch = pipe(**batched_inputs) - assert output_batch[0].shape[0] == batch_size - inputs["generator"] = self.get_generator(0) - - output = pipe(**inputs) - logger.setLevel(level=ppdiffusers.logging.WARNING) - if test_max_difference: - if relax_max_difference: - diff = np.abs(output_batch[0][0] - output[0][0]) - diff = diff.flatten() - diff.sort() - max_diff = np.median(diff[-5:]) - else: - max_diff = np.abs(output_batch[0][0] - output[0][0]).max() - assert max_diff < expected_max_diff - if test_mean_pixel_difference: - assert_mean_pixel_difference(output_batch[0][0], output[0][0]) - - def test_dict_tuple_outputs_equivalent(self): - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - - output = pipe(**self.get_dummy_inputs())[0] - output_tuple = pipe(**self.get_dummy_inputs(), return_dict=False)[0] - max_diff = np.abs(to_np(output) - to_np(output_tuple)).max() - self.assertLess(max_diff, 0.005) - - def test_components_function(self): - init_components = self.get_dummy_components() - pipe = self.pipeline_class(**init_components) - self.assertTrue(hasattr(pipe, "components")) - self.assertTrue(set(pipe.components.keys()) == set(init_components.keys())) - - def test_float16_inference(self, expected_max_diff=1e-2): - self._test_float16_inference(expected_max_diff) - - def _test_float16_inference(self, expected_max_diff=1e-2): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - pipe_fp16 = self.pipeline_class(**components) - pipe_fp16.to(paddle_dtype=paddle.float16) - pipe_fp16.set_progress_bar_config(disable=None) - output = pipe(**self.get_dummy_inputs())[0] - output_fp16 = pipe_fp16(**self.get_dummy_inputs())[0] - max_diff = np.abs(to_np(output) - to_np(output_fp16)).max() - self.assertLess(max_diff, expected_max_diff, "The outputs of the fp16 and fp32 pipelines are too different.") - - def 
test_save_load_float16(self, expected_max_diff=1e-2): - self._test_save_load_float16(expected_max_diff) - - def _test_save_load_float16(self, expected_max_diff=1e-2): - pass - # components = self.get_dummy_components() - # for name, module in components.items(): - # if hasattr(module, "to"): - # module.to(dtype=paddle.float16) - # components[name] = module - # pipe = self.pipeline_class(**components) - # pipe.set_progress_bar_config(disable=None) - # inputs = self.get_dummy_inputs() - # output = pipe(**inputs)[0] - # with tempfile.TemporaryDirectory() as tmpdir: - # pipe.save_pretrained(tmpdir) - # pipe_loaded = self.pipeline_class.from_pretrained( - # tmpdir, paddle_dtype=paddle.float16, from_diffusers=False - # ) - # pipe_loaded.set_progress_bar_config(disable=None) - # for name, component in pipe_loaded.components.items(): - # if hasattr(component, "dtype"): - # self.assertTrue( - # component.dtype == paddle.float16, - # f"`{name}.dtype` switched from `float16` to {component.dtype} after loading.", - # ) - # inputs = self.get_dummy_inputs() - # output_loaded = pipe_loaded(**inputs)[0] - # max_diff = np.abs(to_np(output) - to_np(output_loaded)).max() - # self.assertLess(max_diff, 5, "The output of the fp16 pipeline changed after saving and loading.") - - def test_save_load_optional_components(self): - if not hasattr(self.pipeline_class, "_optional_components"): - return - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - - for optional_component in pipe._optional_components: - setattr(pipe, optional_component, None) - inputs = self.get_dummy_inputs() - output = pipe(**inputs)[0] - with tempfile.TemporaryDirectory() as tmpdir: - # TODO check this - pipe.save_pretrained(tmpdir, to_diffusers=False) - pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, from_diffusers=False) - pipe_loaded.set_progress_bar_config(disable=None) - for optional_component in pipe._optional_components: - self.assertTrue( - getattr(pipe_loaded, optional_component) is None, - f"`{optional_component}` did not stay set to None after loading.", - ) - inputs = self.get_dummy_inputs() - output_loaded = pipe_loaded(**inputs)[0] - max_diff = np.abs(to_np(output) - to_np(output_loaded)).max() - self.assertLess(max_diff, 0.002) - - # def test_to_device(self): - # components = self.get_dummy_components() - # pipe = self.pipeline_class(**components) - # # we donot test cpu - # # pipe.set_progress_bar_config(disable=None) - # # pipe.to("cpu") - # # model_devices = [str(component.device) for component in components.values() if hasattr(component, "device")] - # # self.assertTrue(all(device == "Place(cpu)" for device in model_devices)) - # # output_cpu = pipe(**self.get_dummy_inputs())[0] - # # self.assertTrue(np.isnan(output_cpu).sum() == 0) - # pipe.to("gpu") - # model_devices = [str(component.device) for component in components.values() if hasattr(component, "device")] - # self.assertTrue(all(device == "Place(gpu:0)" for device in model_devices)) - # output_cuda = pipe(**self.get_dummy_inputs())[0] - # self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0) - - def test_to_dtype(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - - model_dtypes = [component.dtype for component in components.values() if hasattr(component, "dtype")] - self.assertTrue(all(dtype == paddle.float32 for dtype in model_dtypes)) - - pipe.to(paddle_dtype=paddle.float16) - 
model_dtypes = [component.dtype for component in components.values() if hasattr(component, "dtype")] - self.assertTrue(all(dtype == paddle.float16 for dtype in model_dtypes)) - - def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass() - - def _test_attention_slicing_forward_pass( - self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=5e-3 - ): - if not self.test_attention_slicing: - return - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - output_without_slicing = pipe(**inputs)[0] - pipe.enable_attention_slicing(slice_size=1) - inputs = self.get_dummy_inputs() - output_with_slicing = pipe(**inputs)[0] - if test_max_difference: - max_diff = np.abs(to_np(output_with_slicing) - to_np(output_without_slicing)).max() - self.assertLess(max_diff, expected_max_diff, "Attention slicing should not affect the inference results") - if test_mean_pixel_difference: - assert_mean_pixel_difference(output_with_slicing[0], output_without_slicing[0]) - - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass() - - def _test_xformers_attention_forwardGenerator_pass( - self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-2 - ): - if not self.test_xformers_attention: - return - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - output_without_xformers = pipe(**inputs)[0] - pipe.enable_xformers_memory_efficient_attention() - inputs = self.get_dummy_inputs() - output_with_xformers = pipe(**inputs)[0] - if test_max_difference: - if hasattr(output_with_xformers, "numpy"): - output_with_xformers = output_with_xformers.numpy() - if hasattr(output_without_xformers, "numpy"): - output_without_xformers = output_without_xformers.numpy() - max_diff = np.abs(output_with_xformers - output_without_xformers).max() - self.assertLess(max_diff, expected_max_diff, "XFormers attention should not affect the inference results") - if test_mean_pixel_difference: - assert_mean_pixel_difference(output_with_xformers[0], output_without_xformers[0]) - - def test_progress_bar(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - inputs = self.get_dummy_inputs() - with io.StringIO() as stderr, contextlib.redirect_stderr(stderr): - _ = pipe(**inputs) - stderr = stderr.getvalue() - max_steps = re.search("/(.*?) 
", stderr).group(1) - self.assertTrue(max_steps is not None and len(max_steps) > 0) - self.assertTrue( - f"{max_steps}/{max_steps}" in stderr, "Progress bar should be enabled and stopped at the max step" - ) - pipe.set_progress_bar_config(disable=True) - with io.StringIO() as stderr, contextlib.redirect_stderr(stderr): - _ = pipe(**inputs) - self.assertTrue(stderr.getvalue() == "", "Progress bar should be disabled") - - def test_num_images_per_prompt(self): - sig = inspect.signature(self.pipeline_class.__call__) - - if "num_images_per_prompt" not in sig.parameters: - return - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - - batch_sizes = [1, 2] - num_images_per_prompts = [1, 2] - - for batch_size in batch_sizes: - for num_images_per_prompt in num_images_per_prompts: - inputs = self.get_dummy_inputs() - - for key in inputs.keys(): - if key in self.batch_params: - inputs[key] = batch_size * [inputs[key]] - - images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images - - assert images.shape[0] == batch_size * num_images_per_prompt - - -def assert_mean_pixel_difference(image, expected_image): - image = np.asarray(DiffusionPipeline.numpy_to_pil(image)[0], dtype=np.float32) - expected_image = np.asarray(DiffusionPipeline.numpy_to_pil(expected_image)[0], dtype=np.float32) - avg_diff = np.abs(image - expected_image).mean() - assert avg_diff < 10, f"Error image deviates {avg_diff} pixels on average" diff --git a/ppdiffusers/tests/pipelines/text_to_video/__init__.py b/ppdiffusers/tests/pipelines/text_to_video/__init__.py deleted file mode 100644 index 595add0aed9e..000000000000 --- a/ppdiffusers/tests/pipelines/text_to_video/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video.py b/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video.py deleted file mode 100644 index 2b385cf7e031..000000000000 --- a/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import paddle - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - TextToVideoSDPipeline, - UNet3DConditionModel, -) -from ppdiffusers.utils import load_numpy, slow - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineTesterMixin - - -class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = TextToVideoSDPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - required_optional_params = frozenset( - ["num_inference_steps", "generator", "latents", "return_dict", "callback", "callback_steps"] - ) - - def get_dummy_components(self): - paddle.seed(0) - unet = UNet3DConditionModel( - block_out_channels=(32, 64, 64, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D"), - up_block_types=("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"), - cross_attention_dim=32, - attention_head_dim=4, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - paddle.seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - paddle.seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - hidden_act="gelu", - projection_dim=512, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - } - return components - - def get_dummy_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(0) - # "output_type": "pd" is problematic - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "pd", - } - return inputs - - def test_text_to_video_default_case(self): - components = self.get_dummy_components() - sd_pipe = TextToVideoSDPipeline(**components) - sd_pipe = sd_pipe - sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs() - inputs["output_type"] = "np" - frames = sd_pipe(**inputs).frames - image_slice = frames[0][-3:, -3:, (-1)] - assert frames[0].shape == (64, 64, 3) - expected_slice = np.array([51, 148, 141, 100, 238, 122, 141, 181, 79]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False) - - def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False) - - @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.") - def test_inference_batch_consistent(self): - pass 
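One detail worth noting in get_dummy_components above: paddle.seed(0) is called immediately before each sub-model is constructed, so the randomly initialized weights, and therefore the hard-coded expected_slice values in the fast tests, are reproducible from run to run. A stripped-down illustration using the same tiny CLIP text-encoder config as the deleted test:

    import paddle
    from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel

    paddle.seed(0)  # re-seed right before construction -> deterministic random weights
    text_encoder = CLIPTextModel(
        CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
            hidden_act="gelu",
            projection_dim=512,
        )
    )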
- - @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.") - def test_inference_batch_single_identical(self): - pass - - @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.") - def test_num_images_per_prompt(self): - pass - - -@slow -class TextToVideoSDPipelineSlowTests(unittest.TestCase): - def test_full_model(self): - expected_video = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_to_video/video.npy" - ) - pipe = TextToVideoSDPipeline.from_pretrained( - "damo-vilab/text-to-video-ms-1.7b", from_hf_hub=True, from_diffusers=True - ) - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - pipe = pipe - prompt = "Spiderman is surfing" - generator = paddle.Generator().manual_seed(0) - video_frames = pipe(prompt, generator=generator, num_inference_steps=25, output_type="pd").frames - video = video_frames.cpu().numpy() - assert np.abs(expected_video - video).mean() < 0.8 - - def test_two_step_model(self): - expected_video = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_to_video/video_2step.npy" - ) - pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b") - pipe = pipe - prompt = "Spiderman is surfing" - generator = paddle.Generator().manual_seed(0) - video_frames = pipe(prompt, generator=generator, num_inference_steps=2, output_type="pd").frames - video = video_frames.cpu().numpy() - assert np.abs(expected_video - video).mean() < 0.8 diff --git a/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video_zero.py b/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video_zero.py deleted file mode 100644 index 121798ea45e0..000000000000 --- a/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video_zero.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import paddle - -from ppdiffusers import DDIMScheduler, TextToVideoZeroPipeline -from ppdiffusers.utils import load_pd, require_paddle_gpu, slow - -from ..test_pipelines_common import assert_mean_pixel_difference - - -@slow -@require_paddle_gpu -class TextToVideoZeroPipelineSlowTests(unittest.TestCase): - def test_full_model(self): - model_id = "runwayml/stable-diffusion-v1-5" - pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype="float16") - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - generator = paddle.Generator().manual_seed(0) - prompt = "A bear is playing a guitar on Times Square" - result = pipe(prompt=prompt, generator=generator).images - expected_result = load_pd( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text-to-video/A bear is playing a guitar on Times Square.pt" - ) - assert_mean_pixel_difference(result, expected_result) diff --git a/ppdiffusers/tests/pipelines/unclip/__init__.py b/ppdiffusers/tests/pipelines/unclip/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/unclip/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/unclip/test_unclip.py b/ppdiffusers/tests/pipelines/unclip/test_unclip.py deleted file mode 100644 index f742596d4742..000000000000 --- a/ppdiffusers/tests/pipelines/unclip/test_unclip.py +++ /dev/null @@ -1,367 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
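The text-to-video-zero check above follows the same reference-comparison idiom, loading a stored output with load_pd and checking the mean pixel difference rather than an exact match. A condensed sketch; the checkpoint id, reference URL and the torch_dtype="float16" argument are taken verbatim from the deleted test:

    import paddle

    from ppdiffusers import DDIMScheduler, TextToVideoZeroPipeline
    from ppdiffusers.utils import load_pd

    # assert_mean_pixel_difference is the helper defined in the deleted test_pipelines_common.py;
    # the real test imports it as: from ..test_pipelines_common import assert_mean_pixel_difference

    pipe = TextToVideoZeroPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", torch_dtype="float16"
    )
    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)

    generator = paddle.Generator().manual_seed(0)
    result = pipe(prompt="A bear is playing a guitar on Times Square", generator=generator).images

    expected_result = load_pd(
        "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text-to-video/A bear is playing a guitar on Times Square.pt"
    )
    # Mean pixel difference is used because bit-exact reproduction is not expected here.
    assert_mean_pixel_difference(result, expected_result)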
- -import gc -import unittest - -import numpy as np -import paddle -from ..pipeline_params import ( - TEXT_TO_IMAGE_BATCH_PARAMS, - TEXT_TO_IMAGE_PARAMS, -) -from ..test_pipelines_common import ( - PipelineTesterMixin, - assert_mean_pixel_difference, -) - -from paddlenlp.transformers import ( - CLIPTextConfig, - CLIPTextModelWithProjection, - CLIPTokenizer, -) -from ppdiffusers import ( - PriorTransformer, - UnCLIPPipeline, - UnCLIPScheduler, - UNet2DConditionModel, - UNet2DModel, -) -from ppdiffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel -from ppdiffusers.utils import slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - - -class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = UnCLIPPipeline - params = TEXT_TO_IMAGE_PARAMS - { - "negative_prompt", - "height", - "width", - "negative_prompt_embeds", - "guidance_scale", - "prompt_embeds", - "cross_attention_kwargs", - } - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - required_optional_params = frozenset( - [ - "generator", - "return_dict", - "prior_num_inference_steps", - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - ) - test_xformers_attention = False - - @property - def text_embedder_hidden_size(self): - return 32 - - @property - def time_input_dim(self): - return 32 - - @property - def block_out_channels_0(self): - return self.time_input_dim - - @property - def time_embed_dim(self): - return self.time_input_dim * 4 - - @property - def cross_attention_dim(self): - return 100 - - @property - def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - return tokenizer - - @property - def dummy_text_encoder(self): - paddle.seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=self.text_embedder_hidden_size, - projection_dim=self.text_embedder_hidden_size, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModelWithProjection(config) - - @property - def dummy_prior(self): - paddle.seed(0) - model_kwargs = { - "num_attention_heads": 2, - "attention_head_dim": 12, - "embedding_dim": self.text_embedder_hidden_size, - "num_layers": 1, - } - model = PriorTransformer(**model_kwargs) - return model - - @property - def dummy_text_proj(self): - paddle.seed(0) - model_kwargs = { - "clip_embeddings_dim": self.text_embedder_hidden_size, - "time_embed_dim": self.time_embed_dim, - "cross_attention_dim": self.cross_attention_dim, - } - model = UnCLIPTextProjModel(**model_kwargs) - return model - - @property - def dummy_decoder(self): - paddle.seed(0) - model_kwargs = { - "sample_size": 32, - "in_channels": 3, - "out_channels": 6, - "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), - "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), - "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", - "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), - "layers_per_block": 1, - "cross_attention_dim": self.cross_attention_dim, - "attention_head_dim": 4, - "resnet_time_scale_shift": "scale_shift", - "class_embed_type": "identity", - } - model = UNet2DConditionModel(**model_kwargs) - return model - - @property - def dummy_super_res_kwargs(self): - return { - "sample_size": 64, - "layers_per_block": 1, - "down_block_types": ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"), - "up_block_types": 
("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"), - "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), - "in_channels": 6, - "out_channels": 3, - } - - @property - def dummy_super_res_first(self): - paddle.seed(0) - model = UNet2DModel(**self.dummy_super_res_kwargs) - return model - - @property - def dummy_super_res_last(self): - paddle.seed(seed=1) - model = UNet2DModel(**self.dummy_super_res_kwargs) - return model - - def get_dummy_components(self): - prior = self.dummy_prior - decoder = self.dummy_decoder - text_proj = self.dummy_text_proj - text_encoder = self.dummy_text_encoder - tokenizer = self.dummy_tokenizer - super_res_first = self.dummy_super_res_first - super_res_last = self.dummy_super_res_last - prior_scheduler = UnCLIPScheduler( - variance_type="fixed_small_log", prediction_type="sample", num_train_timesteps=1000, clip_sample_range=5.0 - ) - decoder_scheduler = UnCLIPScheduler( - variance_type="learned_range", prediction_type="epsilon", num_train_timesteps=1000 - ) - super_res_scheduler = UnCLIPScheduler( - variance_type="fixed_small_log", prediction_type="epsilon", num_train_timesteps=1000 - ) - components = { - "prior": prior, - "decoder": decoder, - "text_proj": text_proj, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "super_res_first": super_res_first, - "super_res_last": super_res_last, - "prior_scheduler": prior_scheduler, - "decoder_scheduler": decoder_scheduler, - "super_res_scheduler": super_res_scheduler, - } - return components - - def get_dummy_inputs(self, seed=0): - generator = paddle.Generator().manual_seed(seed) - - inputs = { - "prompt": "horse", - "generator": generator, - "prior_num_inference_steps": 2, - "decoder_num_inference_steps": 2, - "super_res_num_inference_steps": 2, - "output_type": "numpy", - } - return inputs - - def test_unclip(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - output = pipe(**self.get_dummy_inputs()) - image = output.images - image_from_tuple = pipe(**self.get_dummy_inputs(), return_dict=False)[0] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 2.6383996e-04, - 9.9658674e-01, - 1.1275411e-03, - 2.6383996e-04, - 2.6383996e-04, - 9.9702907e-01, - 9.9973619e-01, - 9.9545717e-01, - 2.6383996e-04, - ] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 - - def test_unclip_passed_text_embed(self): - class DummyScheduler: - init_noise_sigma = 1 - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - prior = components["prior"] - decoder = components["decoder"] - super_res_first = components["super_res_first"] - tokenizer = components["tokenizer"] - text_encoder = components["text_encoder"] - generator = paddle.Generator().manual_seed(0) - dtype = prior.dtype - batch_size = 1 - shape = batch_size, prior.config.embedding_dim - prior_latents = pipe.prepare_latents( - shape, dtype=dtype, generator=generator, latents=None, scheduler=DummyScheduler() - ) - shape = (batch_size, decoder.config.in_channels, decoder.config.sample_size, decoder.config.sample_size) - decoder_latents = pipe.prepare_latents( - shape, dtype=dtype, generator=generator, latents=None, scheduler=DummyScheduler() - ) - shape = ( - batch_size, - super_res_first.config.in_channels 
// 2, - super_res_first.config.sample_size, - super_res_first.config.sample_size, - ) - super_res_latents = pipe.prepare_latents( - shape, dtype=dtype, generator=generator, latents=None, scheduler=DummyScheduler() - ) - pipe.set_progress_bar_config(disable=None) - prompt = "this is a prompt example" - generator = paddle.Generator().manual_seed(0) - output = pipe( - [prompt], - generator=generator, - prior_num_inference_steps=2, - decoder_num_inference_steps=2, - super_res_num_inference_steps=2, - prior_latents=prior_latents, - decoder_latents=decoder_latents, - super_res_latents=super_res_latents, - output_type="np", - ) - image = output.images - text_inputs = tokenizer( - prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - return_attention_mask=True, - return_tensors="pd", - ) - text_model_output = text_encoder(text_inputs.input_ids) - text_attention_mask = text_inputs.attention_mask - generator = paddle.Generator().manual_seed(0) - image_from_text = pipe( - generator=generator, - prior_num_inference_steps=2, - decoder_num_inference_steps=2, - super_res_num_inference_steps=2, - prior_latents=prior_latents, - decoder_latents=decoder_latents, - super_res_latents=super_res_latents, - text_model_output=text_model_output, - text_attention_mask=text_attention_mask, - output_type="np", - )[0] - assert np.abs(image - image_from_text).max() < 0.0001 - - def test_attention_slicing_forward_pass(self): - test_max_difference = False - self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference, expected_max_diff=0.01) - - def test_inference_batch_single_identical(self): - test_max_difference = False - relax_max_difference = True - additional_params_copy_to_batched_inputs = [ - "prior_num_inference_steps", - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - - self._test_inference_batch_single_identical( - test_max_difference=test_max_difference, - relax_max_difference=relax_max_difference, - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, - ) - - def test_inference_batch_consistent(self): - additional_params_copy_to_batched_inputs = [ - "prior_num_inference_steps", - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - - self._test_inference_batch_consistent( - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs - ) - - -@slow -@require_paddle_gpu -class UnCLIPPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_unclip_karlo(self): - # Hard code image - expected_image = np.array([[0.73281264, 0.69175875, 0.64672112], [0.71919304, 0.65395129, 0.60436499]]) - pipeline = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha") - pipeline.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - output = pipeline("horse", generator=generator, output_type="np") - image = output.images[0] - assert image.shape == (256, 256, 3) - assert_mean_pixel_difference(image[0][0:2], expected_image) diff --git a/ppdiffusers/tests/pipelines/unclip/test_unclip_image_variation.py b/ppdiffusers/tests/pipelines/unclip/test_unclip_image_variation.py deleted file mode 100644 index 99a763b06feb..000000000000 --- a/ppdiffusers/tests/pipelines/unclip/test_unclip_image_variation.py +++ /dev/null @@ -1,398 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import paddle -from ..pipeline_params import ( - IMAGE_VARIATION_BATCH_PARAMS, - IMAGE_VARIATION_PARAMS, -) -from ..test_pipelines_common import ( - PipelineTesterMixin, - assert_mean_pixel_difference, -) - -from paddlenlp.transformers import ( - CLIPImageProcessor, - CLIPTextConfig, - CLIPTextModelWithProjection, - CLIPTokenizer, - CLIPVisionConfig, - CLIPVisionModelWithProjection, -) -from ppdiffusers import ( - DiffusionPipeline, - UnCLIPImageVariationPipeline, - UnCLIPScheduler, - UNet2DConditionModel, - UNet2DModel, -) -from ppdiffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel -from ppdiffusers.utils import floats_tensor, slow -from ppdiffusers.utils.testing_utils import load_image, require_paddle_gpu - - -class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = UnCLIPImageVariationPipeline - params = IMAGE_VARIATION_PARAMS - {"height", "width", "guidance_scale"} - batch_params = IMAGE_VARIATION_BATCH_PARAMS - required_optional_params = frozenset( - ["generator", "return_dict", "decoder_num_inference_steps", "super_res_num_inference_steps"] - ) - test_xformers_attention = False - - @property - def text_embedder_hidden_size(self): - return 32 - - @property - def time_input_dim(self): - return 32 - - @property - def block_out_channels_0(self): - return self.time_input_dim - - @property - def time_embed_dim(self): - return self.time_input_dim * 4 - - @property - def cross_attention_dim(self): - return 100 - - @property - def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - return tokenizer - - @property - def dummy_text_encoder(self): - paddle.seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=self.text_embedder_hidden_size, - projection_dim=self.text_embedder_hidden_size, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModelWithProjection(config) - - @property - def dummy_image_encoder(self): - paddle.seed(0) - config = CLIPVisionConfig( - hidden_size=self.text_embedder_hidden_size, - projection_dim=self.text_embedder_hidden_size, - num_hidden_layers=5, - num_attention_heads=4, - image_size=32, - intermediate_size=37, - patch_size=1, - ) - return CLIPVisionModelWithProjection(config) - - @property - def dummy_text_proj(self): - paddle.seed(0) - model_kwargs = { - "clip_embeddings_dim": self.text_embedder_hidden_size, - "time_embed_dim": self.time_embed_dim, - "cross_attention_dim": self.cross_attention_dim, - } - model = UnCLIPTextProjModel(**model_kwargs) - return model - - @property - def dummy_decoder(self): - paddle.seed(0) - model_kwargs = { - "sample_size": 32, - "in_channels": 3, - "out_channels": 6, - "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), - "up_block_types": 
("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), - "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", - "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), - "layers_per_block": 1, - "cross_attention_dim": self.cross_attention_dim, - "attention_head_dim": 4, - "resnet_time_scale_shift": "scale_shift", - "class_embed_type": "identity", - } - model = UNet2DConditionModel(**model_kwargs) - return model - - @property - def dummy_super_res_kwargs(self): - return { - "sample_size": 64, - "layers_per_block": 1, - "down_block_types": ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"), - "up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"), - "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), - "in_channels": 6, - "out_channels": 3, - } - - @property - def dummy_super_res_first(self): - paddle.seed(0) - model = UNet2DModel(**self.dummy_super_res_kwargs) - return model - - @property - def dummy_super_res_last(self): - paddle.seed(seed=1) - model = UNet2DModel(**self.dummy_super_res_kwargs) - return model - - def get_dummy_components(self): - decoder = self.dummy_decoder - text_proj = self.dummy_text_proj - text_encoder = self.dummy_text_encoder - tokenizer = self.dummy_tokenizer - super_res_first = self.dummy_super_res_first - super_res_last = self.dummy_super_res_last - decoder_scheduler = UnCLIPScheduler( - variance_type="learned_range", prediction_type="epsilon", num_train_timesteps=1000 - ) - super_res_scheduler = UnCLIPScheduler( - variance_type="fixed_small_log", prediction_type="epsilon", num_train_timesteps=1000 - ) - feature_extractor = CLIPImageProcessor(crop_size=32, size=32) - image_encoder = self.dummy_image_encoder - return { - "decoder": decoder, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "text_proj": text_proj, - "feature_extractor": feature_extractor, - "image_encoder": image_encoder, - "super_res_first": super_res_first, - "super_res_last": super_res_last, - "decoder_scheduler": decoder_scheduler, - "super_res_scheduler": super_res_scheduler, - } - - def test_xformers_attention_forwardGenerator_pass(self): - pass - - def get_dummy_inputs(self, seed=0, pil_image=True): - input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - generator = paddle.Generator().manual_seed(seed) - - if pil_image: - input_image = input_image * 0.5 + 0.5 - input_image = input_image.clip(min=0, max=1) - input_image = input_image.cpu().transpose(perm=[0, 2, 3, 1]).cast("float32").numpy() - input_image = DiffusionPipeline.numpy_to_pil(input_image)[0] - return { - "image": input_image, - "generator": generator, - "decoder_num_inference_steps": 2, - "super_res_num_inference_steps": 2, - "output_type": "np", - } - - def test_unclip_image_variation_input_tensor(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - pipeline_inputs = self.get_dummy_inputs(pil_image=False) - output = pipe(**pipeline_inputs) - image = output.images - tuple_pipeline_inputs = self.get_dummy_inputs(pil_image=False) - image_from_tuple = pipe(**tuple_pipeline_inputs, return_dict=False)[0] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 2.7585030e-03, - 2.6383996e-04, - 9.9801058e-01, - 2.6383996e-04, - 9.9531418e-01, - 9.9220645e-01, - 3.6702752e-03, - 9.9970925e-01, - 9.9973619e-01, - ] - ) - assert 
np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 - - def test_unclip_image_variation_input_image(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - pipeline_inputs = self.get_dummy_inputs(pil_image=True) - output = pipe(**pipeline_inputs) - image = output.images - tuple_pipeline_inputs = self.get_dummy_inputs(pil_image=True) - image_from_tuple = pipe(**tuple_pipeline_inputs, return_dict=False)[0] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 5.2168965e-04, - 9.9861604e-01, - 9.9755847e-01, - 9.9804187e-01, - 9.9411416e-01, - 9.9248302e-01, - 9.9973619e-01, - 9.9777901e-01, - 9.9973619e-01, - ] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 - - def test_unclip_image_variation_input_list_images(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - pipeline_inputs = self.get_dummy_inputs(pil_image=True) - pipeline_inputs["image"] = [pipeline_inputs["image"], pipeline_inputs["image"]] - output = pipe(**pipeline_inputs) - image = output.images - tuple_pipeline_inputs = self.get_dummy_inputs(pil_image=True) - tuple_pipeline_inputs["image"] = [tuple_pipeline_inputs["image"], tuple_pipeline_inputs["image"]] - image_from_tuple = pipe(**tuple_pipeline_inputs, return_dict=False)[0] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (2, 64, 64, 3) - expected_slice = np.array( - [ - 5.2201748e-04, - 9.9861759e-01, - 9.9755961e-01, - 9.9804127e-01, - 9.9411547e-01, - 9.9248385e-01, - 9.9973619e-01, - 9.9777836e-01, - 9.9973619e-01, - ] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 - - def test_unclip_passed_image_embed(self): - class DummyScheduler: - init_noise_sigma = 1 - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - dtype = pipe.decoder.dtype - batch_size = 1 - shape = (batch_size, pipe.decoder.config.in_channels, pipe.decoder.config.sample_size, pipe.decoder.config.sample_size) - decoder_latents = pipe.prepare_latents( - shape, dtype=dtype, generator=generator, latents=None, scheduler=DummyScheduler() - ) - shape = ( - batch_size, - pipe.super_res_first.config.in_channels // 2, - pipe.super_res_first.config.sample_size, - pipe.super_res_first.config.sample_size, - ) - super_res_latents = pipe.prepare_latents( - shape, dtype=dtype, generator=generator, latents=None, scheduler=DummyScheduler() - ) - pipeline_inputs = self.get_dummy_inputs(pil_image=False) - img_out_1 = pipe( - **pipeline_inputs, decoder_latents=decoder_latents, super_res_latents=super_res_latents - ).images - pipeline_inputs = self.get_dummy_inputs(pil_image=False) - image = pipeline_inputs.pop("image") - image_embeddings = pipe.image_encoder(image).image_embeds - img_out_2 = pipe( - **pipeline_inputs, - decoder_latents=decoder_latents, - super_res_latents=super_res_latents, - image_embeddings=image_embeddings, - ).images - assert 
np.abs(img_out_1 - img_out_2).max() < 0.0001 - - def test_attention_slicing_forward_pass(self): - test_max_difference = False - # Check is relaxed because there is not a torch 2.0 sliced attention added kv processor - expected_max_diff = 1e-2 - - self._test_attention_slicing_forward_pass( - test_max_difference=test_max_difference, expected_max_diff=expected_max_diff - ) - - def test_inference_batch_single_identical(self): - test_max_difference = False - relax_max_difference = True - additional_params_copy_to_batched_inputs = [ - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - - self._test_inference_batch_single_identical( - test_max_difference=test_max_difference, - relax_max_difference=relax_max_difference, - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, - ) - - def test_inference_batch_consistent(self): - additional_params_copy_to_batched_inputs = [ - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - - self._test_inference_batch_consistent( - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs - ) - - -@slow -@require_paddle_gpu -class UnCLIPImageVariationPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_unclip_image_variation_karlo(self): - input_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unclip/cat.png" - ) - expected_image = np.array([[0.09096909, 0.13343304, 0.26244187], [0.15095001, 0.19459972, 0.3182609]]) - # TODO(wugaosheng): test this function - pipeline = UnCLIPImageVariationPipeline.from_pretrained("kakaobrain/karlo-v1-alpha-image-variations") - pipeline.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - output = pipeline(input_image, generator=generator, output_type="np") - image = output.images[0] - assert image.shape == (256, 256, 3) - - assert_mean_pixel_difference(image[0][0:2], expected_image) diff --git a/ppdiffusers/tests/pipelines/unidiffuser/__init__.py b/ppdiffusers/tests/pipelines/unidiffuser/__init__.py deleted file mode 100644 index 595add0aed9e..000000000000 --- a/ppdiffusers/tests/pipelines/unidiffuser/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/versatile_diffusion/__init__.py b/ppdiffusers/tests/pipelines/versatile_diffusion/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/versatile_diffusion/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py deleted file mode 100644 index 466cc4cd0f6f..000000000000 --- a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import tempfile -import unittest - -import numpy as np -import paddle - -from ppdiffusers import VersatileDiffusionDualGuidedPipeline -from ppdiffusers.utils.testing_utils import load_image, require_paddle_gpu, nightly - - -@nightly -@require_paddle_gpu -class VersatileDiffusionDualGuidedPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_remove_unused_weights_save_load(self): - pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion") - pipe.remove_unused_weights() - pipe.set_progress_bar_config(disable=None) - second_prompt = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - generator = paddle.Generator().manual_seed(0) - image = pipe( - prompt="first prompt", - image=second_prompt, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ).images - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained(tmpdirname, from_diffusers=False) - pipe.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - new_image = pipe( - prompt="first prompt", - image=second_prompt, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ).images - assert np.abs(image - new_image).sum() < 1e-05, "Models don't have the same forward pass" - - def test_inference_dual_guided(self): - pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion") - pipe.remove_unused_weights() - pipe.set_progress_bar_config(disable=None) - first_prompt = "cyberpunk 2077" - second_prompt = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - generator = paddle.Generator().manual_seed(0) - image = 
pipe( - prompt=first_prompt, - image=second_prompt, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=50, - output_type="numpy", - ).images - image_slice = image[0, 253:256, 253:256, -1] - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [ - 0.01500076, - 0.01142624, - 0.01418972, - 0.01518875, - 0.01114869, - 0.01190853, - 0.02978998, - 0.02376354, - 0.02396089, - ] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py deleted file mode 100644 index 7d982b82b9ae..000000000000 --- a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import paddle - -from ppdiffusers import VersatileDiffusionImageVariationPipeline -from ppdiffusers.utils.testing_utils import load_image, require_paddle_gpu, slow - - -class VersatileDiffusionImageVariationPipelineFastTests(unittest.TestCase): - pass - - -@slow -@require_paddle_gpu -class VersatileDiffusionImageVariationPipelineIntegrationTests(unittest.TestCase): - def test_inference_image_variations(self): - pipe = VersatileDiffusionImageVariationPipeline.from_pretrained("shi-labs/versatile-diffusion") - pipe.set_progress_bar_config(disable=None) - image_prompt = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - generator = paddle.Generator().manual_seed(0) - image = pipe( - image=image_prompt, generator=generator, guidance_scale=7.5, num_inference_steps=50, output_type="numpy" - ).images - image_slice = image[0, 253:256, 253:256, -1] - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.12047189, 0.19138041, 0.22884357, 0.08833978, 0.1594424, 0.16826832, 0.07032129, 0.14926612, 0.12981007] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py deleted file mode 100644 index bcc0801bfd35..000000000000 --- a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import tempfile -import unittest - -import numpy as np -import paddle - -from ppdiffusers import VersatileDiffusionPipeline -from ppdiffusers.utils.testing_utils import load_image, require_paddle_gpu, nightly - - -class VersatileDiffusionMegaPipelineFastTests(unittest.TestCase): - pass - - -@nightly -@require_paddle_gpu -class VersatileDiffusionMegaPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_from_save_pretrained(self): - pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion") - pipe.set_progress_bar_config(disable=None) - prompt_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - generator = paddle.Generator().manual_seed(0) - image = pipe.dual_guided( - prompt="first prompt", - image=prompt_image, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ).images - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = VersatileDiffusionPipeline.from_pretrained(tmpdirname, from_diffusers=False) - pipe.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - new_image = pipe.dual_guided( - prompt="first prompt", - image=prompt_image, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ).images - assert np.abs(image - new_image).sum() < 1e-05, "Models don't have the same forward pass" - - def test_inference_dual_guided_then_text_to_image(self): - pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion", paddle_dtype=paddle.float16) - pipe.set_progress_bar_config(disable=None) - prompt = "cyberpunk 2077" - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - generator = paddle.Generator().manual_seed(0) - image = pipe.dual_guided( - prompt=prompt, - image=init_image, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=50, - output_type="numpy", - ).images - image_slice = image[0, 253:256, 253:256, -1] - assert image.shape == (1, 512, 512, 3) - # expected_slice = np.array([0.1448, 0.1619, 0.1741, 0.1086, 0.1147, 0.1128, 0.1199, 0.1165, 0.1001]) - expected_slice = np.array( - [ - 0.03100586, - 0.02929688, - 0.03271484, - 0.02807617, - 0.02905273, - 0.03173828, - 0.02685547, - 0.02807617, - 0.03271484, - ] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.1 - prompt = "A painting of a squirrel eating a burger " - generator = paddle.Generator().manual_seed(0) - image = pipe.text_to_image( - prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=50, output_type="numpy" - ).images - image_slice = image[0, 253:256, 253:256, -1] - assert image.shape == (1, 512, 512, 3) - # expected_slice = np.array([0.3367, 0.3169, 0.2656, 0.387, 0.479, 0.3796, 0.4009, 
0.4878, 0.4778]) - expected_slice = np.array( - [0.0390625, 0.00854492, 0.0, 0.03930664, 0.00878906, 0.04711914, 0.03686523, 0.0, 0.0246582] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.1 - image = pipe.image_variation(init_image, generator=generator, output_type="numpy").images - image_slice = image[0, 253:256, 253:256, -1] - assert image.shape == (1, 512, 512, 3) - # expected_slice = np.array([0.3076, 0.3123, 0.3284, 0.3782, 0.377, 0.3894, 0.4297, 0.4331, 0.4456]) - expected_slice = np.array( - [0.34472656, 0.1940918, 0.10546875, 0.38134766, 0.24560547, 0.13208008, 0.38867188, 0.30566406, 0.18188477] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.1 diff --git a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py deleted file mode 100644 index e4ee8abdf4a3..000000000000 --- a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import tempfile -import unittest - -import numpy as np -import paddle - -from ppdiffusers import VersatileDiffusionTextToImagePipeline -from ppdiffusers.utils.testing_utils import nightly, require_paddle_gpu - - -class VersatileDiffusionTextToImagePipelineFastTests(unittest.TestCase): - pass - - -@nightly -@require_paddle_gpu -class VersatileDiffusionTextToImagePipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_remove_unused_weights_save_load(self): - pipe = VersatileDiffusionTextToImagePipeline.from_pretrained("shi-labs/versatile-diffusion") - pipe.remove_unused_weights() - pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger " - generator = paddle.Generator().manual_seed(0) - image = pipe( - prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=2, output_type="numpy" - ).images - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = VersatileDiffusionTextToImagePipeline.from_pretrained(tmpdirname, from_diffusers=False) - pipe.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - new_image = pipe( - prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=2, output_type="numpy" - ).images - assert np.abs(image - new_image).sum() < 1e-05, "Models don't have the same forward pass" - - def test_inference_text2img(self): - pipe = VersatileDiffusionTextToImagePipeline.from_pretrained("shi-labs/versatile-diffusion") - pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger " - generator = paddle.Generator().manual_seed(0) - image = pipe( - prompt=prompt, 
generator=generator, guidance_scale=7.5, num_inference_steps=50, output_type="numpy" - ).images - image_slice = image[0, 253:256, 253:256, -1] - assert image.shape == (1, 512, 512, 3) - # expected_slice = np.array([0.3493, 0.3757, 0.4093, 0.4495, 0.4233, 0.4102, 0.4507, 0.4756, 0.4787]) - expected_slice = np.array( - [0.0390625, 0.00854492, 0.0, 0.03930664, 0.00878906, 0.04711914, 0.03686523, 0.0, 0.0246582] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/vq_diffusion/__init__.py b/ppdiffusers/tests/pipelines/vq_diffusion/__init__.py deleted file mode 100644 index a72d388cc895..000000000000 --- a/ppdiffusers/tests/pipelines/vq_diffusion/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/ppdiffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py b/ppdiffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py deleted file mode 100644 index d616f1533365..000000000000 --- a/ppdiffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py +++ /dev/null @@ -1,199 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import unittest - -import numpy as np -import paddle - -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import ( - Transformer2DModel, - VQDiffusionPipeline, - VQDiffusionScheduler, - VQModel, -) -from ppdiffusers.pipelines.vq_diffusion.pipeline_vq_diffusion import ( - LearnedClassifierFreeSamplingEmbeddings, -) -from ppdiffusers.utils import load_numpy, slow -from ppdiffusers.utils.testing_utils import require_paddle_gpu - - -class VQDiffusionPipelineFastTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - @property - def num_embed(self): - return 12 - - @property - def num_embeds_ada_norm(self): - return 12 - - @property - def text_embedder_hidden_size(self): - return 32 - - @property - def dummy_vqvae(self): - paddle.seed(0) - model = VQModel( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, - num_vq_embeddings=self.num_embed, - vq_embed_dim=3, - ) - return model - - @property - def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - return tokenizer - - @property - def dummy_text_encoder(self): - paddle.seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=self.text_embedder_hidden_size, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config).eval() - - @property - def dummy_transformer(self): - paddle.seed(0) - height = 12 - width = 12 - model_kwargs = { - "attention_bias": True, - "cross_attention_dim": 32, - "attention_head_dim": height * width, - "num_attention_heads": 1, - "num_vector_embeds": self.num_embed, - "num_embeds_ada_norm": self.num_embeds_ada_norm, - "norm_num_groups": 32, - "sample_size": width, - "activation_fn": "geglu-approximate", - } - model = Transformer2DModel(**model_kwargs) - return model - - def test_vq_diffusion(self): - vqvae = self.dummy_vqvae - text_encoder = self.dummy_text_encoder - tokenizer = self.dummy_tokenizer - transformer = self.dummy_transformer - scheduler = VQDiffusionScheduler(self.num_embed) - learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings(learnable=False) - pipe = VQDiffusionPipeline( - vqvae=vqvae, - text_encoder=text_encoder, - tokenizer=tokenizer, - transformer=transformer, - scheduler=scheduler, - learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings, - ) - pipe.set_progress_bar_config(disable=None) - prompt = "teddy bear playing in the pool" - generator = paddle.Generator().manual_seed(0) - output = pipe([prompt], generator=generator, num_inference_steps=2, output_type="np") - image = output.images - generator = paddle.Generator().manual_seed(0) - image_from_tuple = pipe( - [prompt], generator=generator, output_type="np", return_dict=False, num_inference_steps=2 - )[0] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 24, 24, 3) - expected_slice = np.array( - [0.5900591 , 0.83443725, 0.4418438 , 0.604656 , 0.89781034, 0.40088692, 0.6107253 , 0.87849474, 0.64088374] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - 
expected_slice).max() < 0.01 - - def test_vq_diffusion_classifier_free_sampling(self): - vqvae = self.dummy_vqvae - text_encoder = self.dummy_text_encoder - tokenizer = self.dummy_tokenizer - transformer = self.dummy_transformer - scheduler = VQDiffusionScheduler(self.num_embed) - learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings( - learnable=True, hidden_size=self.text_embedder_hidden_size, length=tokenizer.model_max_length - ) - pipe = VQDiffusionPipeline( - vqvae=vqvae, - text_encoder=text_encoder, - tokenizer=tokenizer, - transformer=transformer, - scheduler=scheduler, - learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings, - ) - pipe.set_progress_bar_config(disable=None) - prompt = "teddy bear playing in the pool" - generator = paddle.Generator().manual_seed(0) - output = pipe([prompt], generator=generator, num_inference_steps=2, output_type="np") - image = output.images - generator = paddle.Generator().manual_seed(0) - image_from_tuple = pipe( - [prompt], generator=generator, output_type="np", return_dict=False, num_inference_steps=2 - )[0] - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 24, 24, 3) - expected_slice = np.array( - [0.61711097, 0.8419658 , 0.5493732 , 0.64064896, 0.97944254, 0.5611503 , 0.6145399 , 0.7063037 , 0.54406035] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 - - -@slow -@require_paddle_gpu -class VQDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - paddle.device.cuda.empty_cache() - - def test_vq_diffusion_classifier_free_sampling(self): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/vq_diffusion/teddy_bear_pool_classifier_free_sampling.npy" - ) - pipeline = VQDiffusionPipeline.from_pretrained("microsoft/vq-diffusion-ithq") - pipeline = pipeline - pipeline.set_progress_bar_config(disable=None) - generator = paddle.Generator().manual_seed(0) - output = pipeline( - "teddy bear playing in the pool", num_images_per_prompt=1, generator=generator, output_type="np" - ) - image = output.images[0] - assert image.shape == (256, 256, 3) - assert np.abs(expected_image - image).max() < 0.01 diff --git a/ppdiffusers/tests/schedulers/__init__.py b/ppdiffusers/tests/schedulers/__init__.py deleted file mode 100644 index 595add0aed9e..000000000000 --- a/ppdiffusers/tests/schedulers/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/ppdiffusers/tests/schedulers/test_scheduler_ddim.py b/ppdiffusers/tests/schedulers/test_scheduler_ddim.py deleted file mode 100644 index c578c2ffb27c..000000000000 --- a/ppdiffusers/tests/schedulers/test_scheduler_ddim.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from ppdiffusers import DDIMScheduler - -from .test_schedulers import SchedulerCommonTest - - -class DDIMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DDIMScheduler,) - forward_default_kwargs = (("eta", 0.0), ("num_inference_steps", 50)) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - "clip_sample": True, - } - - config.update(**kwargs) - return config - - def full_loop(self, **config): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps, eta = 10, 0.0 - - model = self.dummy_model() - sample = self.dummy_sample_deter - - scheduler.set_timesteps(num_inference_steps) - - for t in scheduler.timesteps: - residual = model(sample, t) - sample = scheduler.step(residual, t, sample, eta).prev_sample - - return sample - - def test_timesteps(self): - for timesteps in [100, 500, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_steps_offset(self): - for steps_offset in [0, 1]: - self.check_over_configs(steps_offset=steps_offset) - - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(steps_offset=1) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(5) - assert paddle.equal_all(scheduler.timesteps, paddle.to_tensor([801, 601, 401, 201, 1])) - - def test_betas(self): - for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "squaredcos_cap_v2"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_clip_sample(self): - for clip_sample in [True, False]: - self.check_over_configs(clip_sample=clip_sample) - - def test_thresholding(self): - self.check_over_configs(thresholding=False) - for threshold in [0.5, 1.0, 2.0]: - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs( - thresholding=True, - prediction_type=prediction_type, - sample_max_value=threshold, - ) - - def test_time_indices(self): - for t in [1, 10, 49]: - self.check_over_forward(time_step=t) - - def test_inference_steps(self): - for t, num_inference_steps in zip([1, 10, 50], [10, 50, 500]): - self.check_over_forward(time_step=t, num_inference_steps=num_inference_steps) - - def 
test_eta(self): - for t, eta in zip([1, 10, 49], [0.0, 0.5, 1.0]): - self.check_over_forward(time_step=t, eta=eta) - - def test_variance(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - assert paddle.sum(paddle.abs(scheduler._get_variance(0, 0) - 0.0)) < 1e-5 - assert paddle.sum(paddle.abs(scheduler._get_variance(420, 400) - 0.14771)) < 1e-5 - assert paddle.sum(paddle.abs(scheduler._get_variance(980, 960) - 0.32460)) < 1e-5 - assert paddle.sum(paddle.abs(scheduler._get_variance(0, 0) - 0.0)) < 1e-5 - assert paddle.sum(paddle.abs(scheduler._get_variance(487, 486) - 0.00979)) < 1e-5 - assert paddle.sum(paddle.abs(scheduler._get_variance(999, 998) - 0.02)) < 1e-5 - - def test_full_loop_no_noise(self): - sample = self.full_loop() - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 172.0067) < 1e-2 - assert abs(result_mean.item() - 0.223967) < 1e-3 - - def test_full_loop_with_v_prediction(self): - sample = self.full_loop(prediction_type="v_prediction") - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 52.5302) < 1e-2 - assert abs(result_mean.item() - 0.0684) < 1e-3 - - def test_full_loop_with_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01) - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 149.8295) < 1e-2 - assert abs(result_mean.item() - 0.1951) < 1e-3 - - def test_full_loop_with_no_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=False, beta_start=0.01) - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 149.0784) < 1e-2 - assert abs(result_mean.item() - 0.1941) < 1e-3 diff --git a/ppdiffusers/tests/schedulers/test_scheduler_ddpm.py b/ppdiffusers/tests/schedulers/test_scheduler_ddpm.py deleted file mode 100644 index 72955abc5e4e..000000000000 --- a/ppdiffusers/tests/schedulers/test_scheduler_ddpm.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle - -from ppdiffusers import DDPMScheduler - -from .test_schedulers import SchedulerCommonTest - - -class DDPMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DDPMScheduler,) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - "variance_type": "fixed_small", - "clip_sample": True, - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [1, 5, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "squaredcos_cap_v2"]: - self.check_over_configs(beta_schedule=schedule) - - def test_variance_type(self): - for variance in ["fixed_small", "fixed_large", "other"]: - self.check_over_configs(variance_type=variance) - - def test_clip_sample(self): - for clip_sample in [True, False]: - self.check_over_configs(clip_sample=clip_sample) - - def test_thresholding(self): - self.check_over_configs(thresholding=False) - for threshold in [0.5, 1.0, 2.0]: - for prediction_type in ["epsilon", "sample", "v_prediction"]: - self.check_over_configs( - thresholding=True, - prediction_type=prediction_type, - sample_max_value=threshold, - ) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "sample", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_time_indices(self): - for t in [0, 500, 999]: - self.check_over_forward(time_step=t) - - def test_variance(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - assert paddle.sum(paddle.abs(scheduler._get_variance(0) - 0.0)) < 1e-5 - assert paddle.sum(paddle.abs(scheduler._get_variance(487) - 0.00979)) < 1e-5 - assert paddle.sum(paddle.abs(scheduler._get_variance(999) - 0.02)) < 1e-5 - - def test_full_loop_no_noise(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - num_trained_timesteps = len(scheduler) - - model = self.dummy_model() - sample = self.dummy_sample_deter - generator = paddle.Generator().manual_seed(0) - - for t in reversed(range(num_trained_timesteps)): - # 1. predict noise residual - residual = model(sample, t) - - # 2. predict previous mean of sample x_t-1 - pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample - - # if t > 0: - # noise = self.dummy_sample_deter - # variance = scheduler.get_variance(t) ** (0.5) * noise - # - # sample = pred_prev_sample + variance - sample = pred_prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 261.0068359375) < 1e-2 - assert abs(result_mean.item() - 0.33985263109207153) < 1e-3 - - def test_full_loop_with_v_prediction(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = scheduler_class(**scheduler_config) - - num_trained_timesteps = len(scheduler) - - model = self.dummy_model() - sample = self.dummy_sample_deter - generator = paddle.Generator().manual_seed(0) - - for t in reversed(range(num_trained_timesteps)): - # 1. 
predict noise residual - residual = model(sample, t) - - # 2. predict previous mean of sample x_t-1 - pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample - - # if t > 0: - # noise = self.dummy_sample_deter - # variance = scheduler.get_variance(t) ** (0.5) * noise - # - # sample = pred_prev_sample + variance - sample = pred_prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 202.7893524169922) < 1e-2 - assert abs(result_mean.item() - 0.26404863595962524) < 1e-3 - - def test_custom_timesteps(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - timesteps = [100, 87, 50, 1, 0] - - scheduler.set_timesteps(timesteps=timesteps) - - scheduler_timesteps = scheduler.timesteps - - for i, timestep in enumerate(scheduler_timesteps): - if i == len(timesteps) - 1: - expected_prev_t = -1 - else: - expected_prev_t = timesteps[i + 1] - - prev_t = scheduler.previous_timestep(timestep) - prev_t = prev_t.item() - - self.assertEqual(prev_t, expected_prev_t) - - def test_custom_timesteps_increasing_order(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - timesteps = [100, 87, 50, 51, 0] - - with self.assertRaises(ValueError, msg="`custom_timesteps` must be in descending order."): - scheduler.set_timesteps(timesteps=timesteps) - - def test_custom_timesteps_passing_both_num_inference_steps_and_timesteps(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - timesteps = [100, 87, 50, 1, 0] - num_inference_steps = len(timesteps) - - with self.assertRaises(ValueError, msg="Can only pass one of `num_inference_steps` or `custom_timesteps`."): - scheduler.set_timesteps(num_inference_steps=num_inference_steps, timesteps=timesteps) - - def test_custom_timesteps_too_large(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - timesteps = [scheduler.config.num_train_timesteps] - - with self.assertRaises( - ValueError, - msg="`timesteps` must start before `self.config.train_timesteps`: {scheduler.config.num_train_timesteps}}", - ): - scheduler.set_timesteps(timesteps=timesteps) diff --git a/ppdiffusers/tests/schedulers/test_scheduler_deis.py b/ppdiffusers/tests/schedulers/test_scheduler_deis.py deleted file mode 100644 index 7ea11c219802..000000000000 --- a/ppdiffusers/tests/schedulers/test_scheduler_deis.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import tempfile - -import paddle - -from ppdiffusers import ( - DEISMultistepScheduler, - DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, - UniPCMultistepScheduler, -) - -from .test_schedulers import SchedulerCommonTest - - -class DEISMultistepSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DEISMultistepScheduler,) - forward_default_kwargs = (("num_inference_steps", 25),) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - "solver_order": 2, - } - - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - new_scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output, new_output = sample, sample - for t in range(time_step, time_step + scheduler.config.solver_order + 1): - output = scheduler.step(residual, t, output, **kwargs).prev_sample - new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - pass - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residuals (must be after setting timesteps) - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - # copy over dummy past residuals - new_scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residual (must be after setting timesteps) - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def full_loop(self, scheduler=None, **config): - if scheduler is None: - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - 
scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - return sample - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - time_step_0 = scheduler.timesteps[5] - time_step_1 = scheduler.timesteps[6] - - output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - def test_switch(self): - # make sure that iterating over schedulers with same config names gives same results - # for defaults - scheduler = DEISMultistepScheduler(**self.get_scheduler_config()) - sample = self.full_loop(scheduler=scheduler) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_mean.item() - 0.23916) < 1e-3 - - scheduler = DPMSolverSinglestepScheduler.from_config(scheduler.config) - scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) - scheduler = UniPCMultistepScheduler.from_config(scheduler.config) - scheduler = DEISMultistepScheduler.from_config(scheduler.config) - - sample = self.full_loop(scheduler=scheduler) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_mean.item() - 0.23916) < 1e-3 - - def test_timesteps(self): - for timesteps in [25, 50, 100, 999, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_thresholding(self): - self.check_over_configs(thresholding=False) - for order in [1, 2, 3]: - for solver_type in ["logrho"]: - for threshold in [0.5, 1.0, 2.0]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - thresholding=True, - prediction_type=prediction_type, - sample_max_value=threshold, - algorithm_type="deis", - solver_order=order, - solver_type=solver_type, - ) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_solver_order_and_type(self): - for algorithm_type in ["deis"]: - for solver_type in ["logrho"]: - for order in [1, 2, 3]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - algorithm_type=algorithm_type, - ) - sample = self.full_loop( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - 
algorithm_type=algorithm_type, - ) - assert not paddle.isnan(sample).any(), "Samples have nan numbers" - - def test_lower_order_final(self): - self.check_over_configs(lower_order_final=True) - self.check_over_configs(lower_order_final=False) - - def test_inference_steps(self): - for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_mean.item() - 0.23916) < 1e-3 - - def test_full_loop_with_v_prediction(self): - sample = self.full_loop(prediction_type="v_prediction") - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_mean.item() - 0.091) < 1e-3 - - def test_fp16_support(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter.cast("float16") - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - # TODO, this scheduler output float32 - assert sample.dtype == paddle.float32 diff --git a/ppdiffusers/tests/schedulers/test_scheduler_dpm_multi.py b/ppdiffusers/tests/schedulers/test_scheduler_dpm_multi.py deleted file mode 100644 index 869b1cc9280d..000000000000 --- a/ppdiffusers/tests/schedulers/test_scheduler_dpm_multi.py +++ /dev/null @@ -1,274 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import tempfile - -import paddle - -from ppdiffusers import ( - DEISMultistepScheduler, - DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, - UniPCMultistepScheduler, -) - -from .test_schedulers import SchedulerCommonTest - - -class DPMSolverMultistepSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DPMSolverMultistepScheduler,) - forward_default_kwargs = (("num_inference_steps", 25),) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - "solver_order": 2, - "prediction_type": "epsilon", - "thresholding": False, - "sample_max_value": 1.0, - "algorithm_type": "dpmsolver++", - "solver_type": "midpoint", - "lower_order_final": False, - } - - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - new_scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output, new_output = sample, sample - for t in range(time_step, time_step + scheduler.config.solver_order + 1): - output = scheduler.step(residual, t, output, **kwargs).prev_sample - new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - pass - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residuals (must be after setting timesteps) - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - # copy over dummy past residuals - new_scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residual (must be after setting timesteps) - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def full_loop(self, scheduler=None, **config): - 
if scheduler is None: - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - return sample - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - time_step_0 = scheduler.timesteps[5] - time_step_1 = scheduler.timesteps[6] - - output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - def test_timesteps(self): - for timesteps in [25, 50, 100, 999, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_thresholding(self): - self.check_over_configs(thresholding=False) - for order in [1, 2, 3]: - for solver_type in ["midpoint", "heun"]: - for threshold in [0.5, 1.0, 2.0]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - thresholding=True, - prediction_type=prediction_type, - sample_max_value=threshold, - algorithm_type="dpmsolver++", - solver_order=order, - solver_type=solver_type, - ) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_solver_order_and_type(self): - for algorithm_type in ["dpmsolver", "dpmsolver++"]: - for solver_type in ["midpoint", "heun"]: - for order in [1, 2, 3]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - algorithm_type=algorithm_type, - ) - sample = self.full_loop( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - algorithm_type=algorithm_type, - ) - assert not paddle.isnan(sample).any(), "Samples have nan numbers" - - def test_lower_order_final(self): - self.check_over_configs(lower_order_final=True) - self.check_over_configs(lower_order_final=False) - - def test_inference_steps(self): - for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_mean.item() - 0.3301) < 1e-3 - - def test_full_loop_no_noise_thres(self): - sample = self.full_loop(thresholding=True, 
dynamic_thresholding_ratio=0.87, sample_max_value=0.5) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_mean.item() - 1.1364) < 1e-3 - - def test_full_loop_with_v_prediction(self): - sample = self.full_loop(prediction_type="v_prediction") - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_mean.item() - 0.2251) < 1e-3 - - def test_full_loop_with_karras_and_v_prediction(self): - sample = self.full_loop(prediction_type="v_prediction", use_karras_sigmas=True) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_mean.item() - 0.2096) < 1e-3 - - def test_switch(self): - # make sure that iterating over schedulers with same config names gives same results - # for defaults - scheduler = DPMSolverMultistepScheduler(**self.get_scheduler_config()) - sample = self.full_loop(scheduler=scheduler) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_mean.item() - 0.3301) < 1e-3 - - scheduler = DPMSolverSinglestepScheduler.from_config(scheduler.config) - scheduler = UniPCMultistepScheduler.from_config(scheduler.config) - scheduler = DEISMultistepScheduler.from_config(scheduler.config) - scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) - - sample = self.full_loop(scheduler=scheduler) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_mean.item() - 0.3301) < 1e-3 - - def test_fp16_support(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter.cast("float16") - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - # TODO, this scheduler output float32 - assert sample.dtype == paddle.float32 - - def test_unique_timesteps(self, **config): - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(scheduler.config.num_train_timesteps) - assert len(scheduler.timesteps.unique()) == scheduler.num_inference_steps diff --git a/ppdiffusers/tests/schedulers/test_scheduler_dpm_single.py b/ppdiffusers/tests/schedulers/test_scheduler_dpm_single.py deleted file mode 100644 index ce229323bc36..000000000000 --- a/ppdiffusers/tests/schedulers/test_scheduler_dpm_single.py +++ /dev/null @@ -1,226 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import tempfile - -import paddle - -from ppdiffusers import ( - DEISMultistepScheduler, - DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, - UniPCMultistepScheduler, -) - -from .test_schedulers import SchedulerCommonTest - - -class DPMSolverSinglestepSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DPMSolverSinglestepScheduler,) - forward_default_kwargs = (("num_inference_steps", 25),) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - "solver_order": 2, - "prediction_type": "epsilon", - "thresholding": False, - "sample_max_value": 1.0, - "algorithm_type": "dpmsolver++", - "solver_type": "midpoint", - } - - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - new_scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output, new_output = sample, sample - for t in range(time_step, time_step + scheduler.config.solver_order + 1): - output = scheduler.step(residual, t, output, **kwargs).prev_sample - new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - pass - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residuals (must be after setting timesteps) - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - # copy over dummy past residuals - new_scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residual (must be after setting timesteps) - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def full_loop(self, scheduler=None, **config): - if scheduler is None: - 
scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - return sample - - def test_timesteps(self): - for timesteps in [25, 50, 100, 999, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_switch(self): - # make sure that iterating over schedulers with same config names gives same results - # for defaults - scheduler = DPMSolverSinglestepScheduler(**self.get_scheduler_config()) - sample = self.full_loop(scheduler=scheduler) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_mean.item() - 0.2791) < 1e-3 - - scheduler = DEISMultistepScheduler.from_config(scheduler.config) - scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) - scheduler = UniPCMultistepScheduler.from_config(scheduler.config) - scheduler = DPMSolverSinglestepScheduler.from_config(scheduler.config) - - sample = self.full_loop(scheduler=scheduler) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_mean.item() - 0.2791) < 1e-3 - - def test_thresholding(self): - self.check_over_configs(thresholding=False) - for order in [1, 2, 3]: - for solver_type in ["midpoint", "heun"]: - for threshold in [0.5, 1.0, 2.0]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - thresholding=True, - prediction_type=prediction_type, - sample_max_value=threshold, - algorithm_type="dpmsolver++", - solver_order=order, - solver_type=solver_type, - ) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_solver_order_and_type(self): - for algorithm_type in ["dpmsolver", "dpmsolver++"]: - for solver_type in ["midpoint", "heun"]: - for order in [1, 2, 3]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - algorithm_type=algorithm_type, - ) - sample = self.full_loop( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - algorithm_type=algorithm_type, - ) - assert not paddle.isnan(sample).any(), "Samples have nan numbers" - - def test_lower_order_final(self): - self.check_over_configs(lower_order_final=True) - self.check_over_configs(lower_order_final=False) - - def test_inference_steps(self): - for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_mean.item() - 0.2791) < 1e-3 - - def test_full_loop_with_v_prediction(self): - sample = self.full_loop(prediction_type="v_prediction") - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_mean.item() - 0.1453) < 1e-3 - - def test_fp16_support(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) - scheduler =
scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter.cast("float16") - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - # TODO, this scheduler output float32 - assert sample.dtype == paddle.float32 diff --git a/ppdiffusers/tests/schedulers/test_scheduler_euler.py b/ppdiffusers/tests/schedulers/test_scheduler_euler.py deleted file mode 100644 index d6cfc9fe4474..000000000000 --- a/ppdiffusers/tests/schedulers/test_scheduler_euler.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from ppdiffusers import EulerDiscreteScheduler - -from .test_schedulers import SchedulerCommonTest - - -class EulerDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (EulerDiscreteScheduler,) - num_inference_steps = 10 - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1100, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [10, 50, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "scaled_linear"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_full_loop_no_noise(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - generator = paddle.Generator().manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 10.0807) < 1e-2 - assert abs(result_mean.item() - 0.0131) < 1e-3 - - def test_full_loop_with_v_prediction(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - generator = paddle.Generator().manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter 
* scheduler.init_noise_sigma - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 0.0002) < 1e-2 - assert abs(result_mean.item() - 2.2676e-06) < 1e-3 - - def test_full_loop_device(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - generator = paddle.Generator().manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for t in scheduler.timesteps: - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 10.0807) < 1e-2 - assert abs(result_mean.item() - 0.0131) < 1e-3 - - def test_full_loop_device_karras_sigmas(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config, use_karras_sigmas=True) - - scheduler.set_timesteps(self.num_inference_steps) - - generator = paddle.Generator().manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for t in scheduler.timesteps: - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 124.52299499511719) < 1e-2 - assert abs(result_mean.item() - 0.16213932633399963) < 1e-3 diff --git a/ppdiffusers/tests/schedulers/test_scheduler_euler_ancestral.py b/ppdiffusers/tests/schedulers/test_scheduler_euler_ancestral.py deleted file mode 100644 index fdc7f2a34f30..000000000000 --- a/ppdiffusers/tests/schedulers/test_scheduler_euler_ancestral.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle - -from ppdiffusers import EulerAncestralDiscreteScheduler - -from .test_schedulers import SchedulerCommonTest - - -class EulerAncestralDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (EulerAncestralDiscreteScheduler,) - num_inference_steps = 10 - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1100, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [10, 50, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "scaled_linear"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_full_loop_no_noise(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - generator = paddle.Generator().manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 144.80836487) < 1e-2 - assert abs(result_mean.item() - 0.18855257) < 1e-3 - - def test_full_loop_with_v_prediction(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - generator = paddle.Generator().manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 102.58072662) < 1e-2 - assert abs(result_mean.item() - 0.13356867) < 1e-3 - - def test_full_loop_device(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - generator = paddle.Generator().manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for t in scheduler.timesteps: - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 144.80836487) < 1e-2 - assert abs(result_mean.item() - 
0.18855257) < 1e-3 diff --git a/ppdiffusers/tests/schedulers/test_scheduler_heun.py b/ppdiffusers/tests/schedulers/test_scheduler_heun.py deleted file mode 100644 index 0f62ae519f4e..000000000000 --- a/ppdiffusers/tests/schedulers/test_scheduler_heun.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from ppdiffusers import HeunDiscreteScheduler - -from .test_schedulers import SchedulerCommonTest - - -class HeunDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (HeunDiscreteScheduler,) - num_inference_steps = 10 - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1100, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [10, 50, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "scaled_linear"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_full_loop_no_noise(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - # CUDA - assert abs(result_sum.item() - 0.1233) < 1e-2 - assert abs(result_mean.item() - 0.0002) < 1e-3 - - def test_full_loop_with_v_prediction(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - # CUDA - assert abs(result_sum.item() - 4.693428650170972e-07) < 1e-2 - assert abs(result_mean.item() - 0.0002) < 1e-3 - - def test_full_loop_device(self): - 
scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for t in scheduler.timesteps: - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - # CUDA - assert abs(result_sum.item() - 0.1233) < 1e-2 - assert abs(result_mean.item() - 0.0002) < 1e-3 - - def test_full_loop_device_karras_sigmas(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config, use_karras_sigmas=True) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for t in scheduler.timesteps: - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 0.00015) < 1e-2 - assert abs(result_mean.item() - 1.9869554535034695e-07) < 1e-2 diff --git a/ppdiffusers/tests/schedulers/test_scheduler_ipndm.py b/ppdiffusers/tests/schedulers/test_scheduler_ipndm.py deleted file mode 100644 index fcc066790cbe..000000000000 --- a/ppdiffusers/tests/schedulers/test_scheduler_ipndm.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import tempfile - -import paddle - -from ppdiffusers import IPNDMScheduler - -from .test_schedulers import SchedulerCommonTest - - -class IPNDMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (IPNDMScheduler,) - forward_default_kwargs = (("num_inference_steps", 50),) - - def get_scheduler_config(self, **kwargs): - config = {"num_train_timesteps": 1000} - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - scheduler.ets = dummy_past_residuals[:] - - if time_step is None: - time_step = scheduler.timesteps[len(scheduler.timesteps) // 2] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - new_scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - new_scheduler.ets = dummy_past_residuals[:] - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - pass - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residuals (must be after setting timesteps) - scheduler.ets = dummy_past_residuals[:] - - if time_step is None: - time_step = scheduler.timesteps[len(scheduler.timesteps) // 2] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - # copy over dummy past residuals - new_scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residual (must be after setting timesteps) - new_scheduler.ets = dummy_past_residuals[:] - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not 
identical" - - def full_loop(self, **config): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - return sample - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05] - scheduler.ets = dummy_past_residuals[:] - - time_step_0 = scheduler.timesteps[5] - time_step_1 = scheduler.timesteps[6] - - output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - def test_timesteps(self): - for timesteps in [100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps, time_step=None) - - def test_inference_steps(self): - for t, num_inference_steps in zip([1, 5, 10], [10, 50, 100]): - self.check_over_forward(num_inference_steps=num_inference_steps, time_step=None) - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_mean.item() - 2540529) < 10 diff --git a/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py b/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py deleted file mode 100644 index 770b4f226ba5..000000000000 --- a/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle - -from ppdiffusers import KDPM2AncestralDiscreteScheduler - -from .test_schedulers import SchedulerCommonTest - - -class KDPM2AncestralDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (KDPM2AncestralDiscreteScheduler,) - num_inference_steps = 10 - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1100, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [10, 50, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "scaled_linear"]: - self.check_over_configs(beta_schedule=schedule) - - def test_full_loop_no_noise(self): - - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - generator = paddle.Generator().manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 13913.05566406) < 1e-2 - assert abs(result_mean.item() - 18.11595917) < 5e-3 - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_full_loop_with_v_prediction(self): - - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - generator = paddle.Generator().manual_seed(0) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 327.80270386) < 1e-2 - assert abs(result_mean.item() - 0.42682642) < 1e-3 - - def test_full_loop_device(self): - - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - generator = paddle.Generator().manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for t in scheduler.timesteps: - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - assert abs(result_sum.item() - 13913.05566406) < 1e-1 - assert 
abs(result_mean.item() - 18.11595917) < 1e-3 diff --git a/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py b/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py deleted file mode 100644 index 3da7b7e75fd4..000000000000 --- a/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from ppdiffusers import KDPM2DiscreteScheduler - -from .test_schedulers import SchedulerCommonTest - - -class KDPM2DiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (KDPM2DiscreteScheduler,) - num_inference_steps = 10 - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1100, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [10, 50, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "scaled_linear"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_full_loop_with_v_prediction(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - # CUDA - assert abs(result_sum.item() - 4.693428650170972e-07) < 1e-2 - assert abs(result_mean.item() - 0.0002) < 1e-3 - - def test_full_loop_no_noise(self): - - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - # CUDA - assert abs(result_sum.item() - 20.4125) < 1e-2 - assert abs(result_mean.item() 
- 0.0266) < 1e-3 - - def test_full_loop_device(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for t in scheduler.timesteps: - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - # CUDA - assert abs(result_sum.item() - 20.4125) < 1e-2 - assert abs(result_mean.item() - 0.0266) < 1e-3 diff --git a/ppdiffusers/tests/schedulers/test_scheduler_lms.py b/ppdiffusers/tests/schedulers/test_scheduler_lms.py deleted file mode 100644 index 8ee87bbddf62..000000000000 --- a/ppdiffusers/tests/schedulers/test_scheduler_lms.py +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from ppdiffusers import LMSDiscreteScheduler - -from .test_schedulers import SchedulerCommonTest - - -class LMSDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (LMSDiscreteScheduler,) - num_inference_steps = 10 - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1100, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [10, 50, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "scaled_linear"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_time_indices(self): - for t in [0, 500, 800]: - self.check_over_forward(time_step=t) - - def test_full_loop_no_noise(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 1006.388) < 1e-2 - assert abs(result_mean.item() - 1.31) < 1e-3 - - def 
test_full_loop_with_v_prediction(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 0.0017) < 1e-2 - assert abs(result_mean.item() - 2.2676e-06) < 1e-3 - - def test_full_loop_device(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 1006.388) < 1e-2 - assert abs(result_mean.item() - 1.31) < 1e-3 diff --git a/ppdiffusers/tests/schedulers/test_scheduler_pndm.py b/ppdiffusers/tests/schedulers/test_scheduler_pndm.py deleted file mode 100644 index 1fe1c4490101..000000000000 --- a/ppdiffusers/tests/schedulers/test_scheduler_pndm.py +++ /dev/null @@ -1,256 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import tempfile - -import paddle - -from ppdiffusers import PNDMScheduler - -from .test_schedulers import SchedulerCommonTest - - -class PNDMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (PNDMScheduler,) - forward_default_kwargs = (("num_inference_steps", 50),) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - scheduler.ets = dummy_past_residuals[:] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - new_scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - new_scheduler.ets = dummy_past_residuals[:] - - output = scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - output = scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - pass - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residuals (must be after setting timesteps) - scheduler.ets = dummy_past_residuals[:] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - # copy over dummy past residuals - new_scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residual (must be after setting timesteps) - new_scheduler.ets = dummy_past_residuals[:] - - output = scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - output = scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def full_loop(self, **config): - scheduler_class = 
self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.prk_timesteps): - residual = model(sample, t) - sample = scheduler.step_prk(residual, t, sample).prev_sample - - for i, t in enumerate(scheduler.plms_timesteps): - residual = model(sample, t) - sample = scheduler.step_plms(residual, t, sample).prev_sample - - return sample - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05] - scheduler.ets = dummy_past_residuals[:] - - output_0 = scheduler.step_prk(residual, 0, sample, **kwargs).prev_sample - output_1 = scheduler.step_prk(residual, 1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - output_0 = scheduler.step_plms(residual, 0, sample, **kwargs).prev_sample - output_1 = scheduler.step_plms(residual, 1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - def test_timesteps(self): - for timesteps in [100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_steps_offset(self): - for steps_offset in [0, 1]: - self.check_over_configs(steps_offset=steps_offset) - - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(steps_offset=1) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(10) - assert paddle.equal_all( - scheduler.timesteps, - paddle.to_tensor( - [901, 851, 851, 801, 801, 751, 751, 701, 701, 651, 651, 601, 601, 501, 401, 301, 201, 101, 1] - ), - ) - - def test_betas(self): - for beta_start, beta_end in zip([0.0001, 0.001], [0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "squaredcos_cap_v2"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_time_indices(self): - for t in [1, 5, 10]: - self.check_over_forward(time_step=t) - - def test_inference_steps(self): - for t, num_inference_steps in zip([1, 5, 10], [10, 50, 100]): - self.check_over_forward(num_inference_steps=num_inference_steps) - - def test_pow_of_3_inference_steps(self): - # earlier version of set_timesteps() caused an error indexing alpha's with inference steps as power of 3 - num_inference_steps = 27 - - for scheduler_class in self.scheduler_classes: - sample = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config() - 
scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(num_inference_steps) - - # before power of 3 fix, would error on first step, so we only need to do two - for i, t in enumerate(scheduler.prk_timesteps[:2]): - sample = scheduler.step_prk(residual, t, sample).prev_sample - - def test_inference_plms_no_past_residuals(self): - with self.assertRaises(ValueError): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.step_plms(self.dummy_sample, 1, self.dummy_sample).prev_sample - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 198.1318) < 1e-2 - assert abs(result_mean.item() - 0.2580) < 1e-3 - - def test_full_loop_with_v_prediction(self): - sample = self.full_loop(prediction_type="v_prediction") - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 67.3986) < 1e-2 - assert abs(result_mean.item() - 0.0878) < 1e-3 - - def test_full_loop_with_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01) - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 230.0399) < 1e-2 - assert abs(result_mean.item() - 0.2995) < 1e-3 - - def test_full_loop_with_no_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=False, beta_start=0.01) - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 186.9482) < 1e-2 - assert abs(result_mean.item() - 0.2434) < 1e-3 diff --git a/ppdiffusers/tests/schedulers/test_scheduler_score_sde_ve.py b/ppdiffusers/tests/schedulers/test_scheduler_score_sde_ve.py deleted file mode 100644 index ac15c502eda8..000000000000 --- a/ppdiffusers/tests/schedulers/test_scheduler_score_sde_ve.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import tempfile -import unittest - -import numpy as np -import paddle - -from ppdiffusers import ScoreSdeVeScheduler - - -class ScoreSdeVeSchedulerTest(unittest.TestCase): - # TODO adapt with class SchedulerCommonTest (scheduler needs Numpy Integration) - scheduler_classes = (ScoreSdeVeScheduler,) - forward_default_kwargs = () - - @property - def dummy_sample(self): - batch_size = 4 - num_channels = 3 - height = 8 - width = 8 - - sample = paddle.rand((batch_size, num_channels, height, width)) - - return sample - - @property - def dummy_sample_deter(self): - batch_size = 4 - num_channels = 3 - height = 8 - width = 8 - - num_elems = batch_size * num_channels * height * width - sample = paddle.arange(num_elems) - sample = sample.reshape([num_channels, height, width, batch_size]) - sample = sample / num_elems - sample = sample.transpose([3, 0, 1, 2]) - - return sample - - def dummy_model(self): - def model(sample, t, *args): - return sample * t / (t + 1) - - return model - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 2000, - "snr": 0.15, - "sigma_min": 0.01, - "sigma_max": 1348, - "sampling_eps": 1e-5, - } - - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - - for scheduler_class in self.scheduler_classes: - sample = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - - output = scheduler.step_pred( - residual, time_step, sample, generator=paddle.Generator().manual_seed(0), **kwargs - ).prev_sample - new_output = new_scheduler.step_pred( - residual, time_step, sample, generator=paddle.Generator().manual_seed(0), **kwargs - ).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - output = scheduler.step_correct( - residual, sample, generator=paddle.Generator().manual_seed(0), **kwargs - ).prev_sample - new_output = new_scheduler.step_correct( - residual, sample, generator=paddle.Generator().manual_seed(0), **kwargs - ).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler correction are not identical" - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - kwargs.update(forward_kwargs) - - for scheduler_class in self.scheduler_classes: - sample = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - - output = scheduler.step_pred( - residual, time_step, sample, generator=paddle.Generator().manual_seed(0), **kwargs - ).prev_sample - new_output = new_scheduler.step_pred( - residual, time_step, sample, generator=paddle.Generator().manual_seed(0), **kwargs - ).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - output = scheduler.step_correct( - residual, sample, generator=paddle.Generator().manual_seed(0), **kwargs - ).prev_sample - new_output = new_scheduler.step_correct( - residual, sample, generator=paddle.Generator().manual_seed(0), **kwargs - ).prev_sample - 
- assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler correction are not identical" - - def test_timesteps(self): - for timesteps in [10, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_sigmas(self): - for sigma_min, sigma_max in zip([0.0001, 0.001, 0.01], [1, 100, 1000]): - self.check_over_configs(sigma_min=sigma_min, sigma_max=sigma_max) - - def test_time_indices(self): - for t in [0.1, 0.5, 0.75]: - self.check_over_forward(time_step=t) - - def test_full_loop_no_noise(self): - kwargs = dict(self.forward_default_kwargs) - - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 3 - - model = self.dummy_model() - sample = self.dummy_sample_deter - - scheduler.set_sigmas(num_inference_steps) - scheduler.set_timesteps(num_inference_steps) - generator = paddle.Generator().manual_seed(0) - - for i, t in enumerate(scheduler.timesteps): - sigma_t = scheduler.sigmas[i] - - for _ in range(scheduler.config.correct_steps): - with paddle.no_grad(): - model_output = model(sample, sigma_t) - sample = scheduler.step_correct(model_output, sample, generator=generator, **kwargs).prev_sample - - with paddle.no_grad(): - model_output = model(sample, sigma_t) - - output = scheduler.step_pred(model_output, t, sample, generator=generator, **kwargs) - sample, _ = output.prev_sample, output.prev_sample_mean - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert np.isclose(result_sum.item(), 13210036224.0) - assert np.isclose(result_mean.item(), 17200568.0) - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output_0 = scheduler.step_pred( - residual, 0, sample, generator=paddle.Generator().manual_seed(0), **kwargs - ).prev_sample - output_1 = scheduler.step_pred( - residual, 1, sample, generator=paddle.Generator().manual_seed(0), **kwargs - ).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) diff --git a/ppdiffusers/tests/schedulers/test_scheduler_unclip.py b/ppdiffusers/tests/schedulers/test_scheduler_unclip.py deleted file mode 100644 index b37fa2c51327..000000000000 --- a/ppdiffusers/tests/schedulers/test_scheduler_unclip.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-import paddle
-
-from ppdiffusers import UnCLIPScheduler
-
-from .test_schedulers import SchedulerCommonTest
-
-
-# UnCLIPScheduler is a modified DDPMScheduler with a subset of the configuration.
-class UnCLIPSchedulerTest(SchedulerCommonTest):
-    scheduler_classes = (UnCLIPScheduler,)
-
-    def get_scheduler_config(self, **kwargs):
-        config = {
-            "num_train_timesteps": 1000,
-            "variance_type": "fixed_small_log",
-            "clip_sample": True,
-            "clip_sample_range": 1.0,
-            "prediction_type": "epsilon",
-        }
-
-        config.update(**kwargs)
-        return config
-
-    def test_timesteps(self):
-        for timesteps in [1, 5, 100, 1000]:
-            self.check_over_configs(num_train_timesteps=timesteps)
-
-    def test_variance_type(self):
-        for variance in ["fixed_small_log", "learned_range"]:
-            self.check_over_configs(variance_type=variance)
-
-    def test_clip_sample(self):
-        for clip_sample in [True, False]:
-            self.check_over_configs(clip_sample=clip_sample)
-
-    def test_clip_sample_range(self):
-        for clip_sample_range in [1, 5, 10, 20]:
-            self.check_over_configs(clip_sample_range=clip_sample_range)
-
-    def test_prediction_type(self):
-        for prediction_type in ["epsilon", "sample"]:
-            self.check_over_configs(prediction_type=prediction_type)
-
-    def test_time_indices(self):
-        for time_step in [0, 500, 999]:
-            for prev_timestep in [None, 5, 100, 250, 500, 750]:
-                if prev_timestep is not None and prev_timestep >= time_step:
-                    continue
-
-                self.check_over_forward(time_step=time_step, prev_timestep=prev_timestep)
-
-    def test_variance_fixed_small_log(self):
-        scheduler_class = self.scheduler_classes[0]
-        scheduler_config = self.get_scheduler_config(variance_type="fixed_small_log")
-        scheduler = scheduler_class(**scheduler_config)
-
-        assert paddle.sum(paddle.abs(scheduler._get_variance(0) - 1.0000e-10)) < 1e-5
-        assert paddle.sum(paddle.abs(scheduler._get_variance(487) - 0.0549625)) < 1e-5
-        assert paddle.sum(paddle.abs(scheduler._get_variance(999) - 0.9994987)) < 1e-5
-
-    def test_variance_learned_range(self):
-        scheduler_class = self.scheduler_classes[0]
-        scheduler_config = self.get_scheduler_config(variance_type="learned_range")
-        scheduler = scheduler_class(**scheduler_config)
-
-        predicted_variance = 0.5
-
-        # the interpolated log-variance should match the expected values to within 1e-5
-        assert abs(scheduler._get_variance(1, predicted_variance=predicted_variance) - (-10.1712790)) < 1e-5
-        assert abs(scheduler._get_variance(487, predicted_variance=predicted_variance) - (-5.7998052)) < 1e-5
-        assert abs(scheduler._get_variance(999, predicted_variance=predicted_variance) - (-0.0010011)) < 1e-5
-
-    def test_full_loop(self):
-        scheduler_class = self.scheduler_classes[0]
-        scheduler_config = self.get_scheduler_config()
-        scheduler = scheduler_class(**scheduler_config)
-
-        timesteps = scheduler.timesteps
-
-        model = self.dummy_model()
-        sample = self.dummy_sample_deter
-        generator = paddle.Generator().manual_seed(0)
-
-        for i, t in enumerate(timesteps):
-            # 1. predict noise residual
-            residual = model(sample, t)
-
-            # 2.
predict previous mean of sample x_t-1 - pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample - - sample = pred_prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 255.86759949) < 1e-2 - assert abs(result_mean.item() - 0.33316097) < 1e-3 - - def test_full_loop_skip_timesteps(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(25) - - timesteps = scheduler.timesteps - - model = self.dummy_model() - sample = self.dummy_sample_deter - generator = paddle.Generator().manual_seed(0) - - for i, t in enumerate(timesteps): - # 1. predict noise residual - residual = model(sample, t) - - if i + 1 == timesteps.shape[0]: - prev_timestep = None - else: - prev_timestep = timesteps[i + 1] - - # 2. predict previous mean of sample x_t-1 - pred_prev_sample = scheduler.step( - residual, t, sample, prev_timestep=prev_timestep, generator=generator - ).prev_sample - - sample = pred_prev_sample - - result_sum = paddle.sum(paddle.abs(sample)) - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_sum.item() - 249.76672363) < 1e-2 - assert abs(result_mean.item() - 0.32521713) < 1e-3 - - def test_trained_betas(self): - pass - - def test_add_noise_device(self): - pass diff --git a/ppdiffusers/tests/schedulers/test_scheduler_unipc.py b/ppdiffusers/tests/schedulers/test_scheduler_unipc.py deleted file mode 100644 index 0c19a3bb8387..000000000000 --- a/ppdiffusers/tests/schedulers/test_scheduler_unipc.py +++ /dev/null @@ -1,253 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-import tempfile
-
-import paddle
-
-from ppdiffusers import (
-    DEISMultistepScheduler,
-    DPMSolverMultistepScheduler,
-    DPMSolverSinglestepScheduler,
-    UniPCMultistepScheduler,
-)
-
-from .test_schedulers import SchedulerCommonTest
-
-
-class UniPCMultistepSchedulerTest(SchedulerCommonTest):
-    scheduler_classes = (UniPCMultistepScheduler,)
-    forward_default_kwargs = (("num_inference_steps", 25),)
-
-    def get_scheduler_config(self, **kwargs):
-        config = {
-            "num_train_timesteps": 1000,
-            "beta_start": 0.0001,
-            "beta_end": 0.02,
-            "beta_schedule": "linear",
-            "solver_order": 2,
-            "solver_type": "bh1",
-        }
-
-        config.update(**kwargs)
-        return config
-
-    def check_over_configs(self, time_step=0, **config):
-        kwargs = dict(self.forward_default_kwargs)
-        num_inference_steps = kwargs.pop("num_inference_steps", None)
-        sample = self.dummy_sample
-        residual = 0.1 * sample
-        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
-
-        for scheduler_class in self.scheduler_classes:
-            scheduler_config = self.get_scheduler_config(**config)
-            scheduler = scheduler_class(**scheduler_config)
-            scheduler.set_timesteps(num_inference_steps)
-            # copy over dummy past residuals
-            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                scheduler.save_config(tmpdirname)
-                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
-                new_scheduler.set_timesteps(num_inference_steps)
-                # copy over dummy past residuals
-                new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
-
-            output, new_output = sample, sample
-            for t in range(time_step, time_step + scheduler.config.solver_order + 1):
-                output = scheduler.step(residual, t, output, **kwargs).prev_sample
-                new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample
-
-                assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
-
-    def check_over_forward(self, time_step=0, **forward_kwargs):
-        kwargs = dict(self.forward_default_kwargs)
-        kwargs.update(forward_kwargs)
-        num_inference_steps = kwargs.pop("num_inference_steps", None)
-        sample = self.dummy_sample
-        residual = 0.1 * sample
-        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
-
-        for scheduler_class in self.scheduler_classes:
-            scheduler_config = self.get_scheduler_config()
-            scheduler = scheduler_class(**scheduler_config)
-            scheduler.set_timesteps(num_inference_steps)
-
-            # copy over dummy past residuals (must be after setting timesteps)
-            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                scheduler.save_config(tmpdirname)
-                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
-                # copy over dummy past residuals
-                new_scheduler.set_timesteps(num_inference_steps)
-
-                # copy over dummy past residual (must be after setting timesteps)
-                new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
-
-            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
-            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
-
-            assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
-
-    def full_loop(self, scheduler=None, **config):
-        # only build a default scheduler from the config when the caller does not pass one,
-        # so tests such as `test_switch` actually exercise the scheduler they construct
-        if scheduler is None:
-            scheduler_class = self.scheduler_classes[0]
-            scheduler_config = self.get_scheduler_config(**config)
-            scheduler = scheduler_class(**scheduler_config)
-
-        num_inference_steps = 10
-        model = self.dummy_model()
-        sample = self.dummy_sample_deter
-        scheduler.set_timesteps(num_inference_steps)
-
-        for i, t in enumerate(scheduler.timesteps):
-            residual = model(sample, t)
-            sample = scheduler.step(residual, t, sample).prev_sample
-
-        return sample
-
-    def test_step_shape(self):
-        kwargs = dict(self.forward_default_kwargs)
-
-        num_inference_steps = kwargs.pop("num_inference_steps", None)
-
-        for scheduler_class in self.scheduler_classes:
-            scheduler_config = self.get_scheduler_config()
-            scheduler = scheduler_class(**scheduler_config)
-
-            sample = self.dummy_sample
-            residual = 0.1 * sample
-
-            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
-                scheduler.set_timesteps(num_inference_steps)
-            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
-                kwargs["num_inference_steps"] = num_inference_steps
-
-            # copy over dummy past residuals (must be done after set_timesteps)
-            dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
-            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
-
-            time_step_0 = scheduler.timesteps[5]
-            time_step_1 = scheduler.timesteps[6]
-
-            output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
-            output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
-
-            self.assertEqual(output_0.shape, sample.shape)
-            self.assertEqual(output_0.shape, output_1.shape)
-
-    def test_switch(self):
-        # make sure that iterating over schedulers with same config names gives same results
-        # for defaults
-        scheduler = UniPCMultistepScheduler(**self.get_scheduler_config())
-        sample = self.full_loop(scheduler=scheduler)
-        result_mean = paddle.mean(paddle.abs(sample))
-
-        assert abs(result_mean.item() - 0.2521) < 1e-3
-
-        scheduler = DPMSolverSinglestepScheduler.from_config(scheduler.config)
-        scheduler = DEISMultistepScheduler.from_config(scheduler.config)
-        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
-        scheduler = UniPCMultistepScheduler.from_config(scheduler.config)
-
-        sample = self.full_loop(scheduler=scheduler)
-        result_mean = paddle.mean(paddle.abs(sample))
-
-        assert abs(result_mean.item() - 0.2521) < 1e-3
-
-    def test_timesteps(self):
-        for timesteps in [25, 50, 100, 999, 1000]:
-            self.check_over_configs(num_train_timesteps=timesteps)
-
-    def test_thresholding(self):
-        self.check_over_configs(thresholding=False)
-        for order in [1, 2, 3]:
-            for solver_type in ["bh1", "bh2"]:
-                for threshold in [0.5, 1.0, 2.0]:
-                    for prediction_type in ["epsilon", "sample"]:
-                        self.check_over_configs(
-                            thresholding=True,
-                            prediction_type=prediction_type,
-                            sample_max_value=threshold,
-                            solver_order=order,
-                            solver_type=solver_type,
-                        )
-
-    def test_prediction_type(self):
-        for prediction_type in ["epsilon", "v_prediction"]:
-            self.check_over_configs(prediction_type=prediction_type)
-
-    def test_solver_order_and_type(self):
-        for solver_type in ["bh1", "bh2"]:
-            for order in [1, 2, 3]:
-                for prediction_type in ["epsilon", "sample"]:
-                    self.check_over_configs(
-                        solver_order=order,
-                        solver_type=solver_type,
-                        prediction_type=prediction_type,
-                    )
-                    sample = self.full_loop(
-                        solver_order=order,
-                        solver_type=solver_type,
-                        prediction_type=prediction_type,
-                    )
-                    assert not paddle.isnan(sample).any(), "Samples have nan numbers"
-
-    def 
test_lower_order_final(self): - self.check_over_configs(lower_order_final=True) - self.check_over_configs(lower_order_final=False) - - def test_inference_steps(self): - for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_mean.item() - 0.2521) < 1e-3 - - def test_full_loop_with_v_prediction(self): - sample = self.full_loop(prediction_type="v_prediction") - result_mean = paddle.mean(paddle.abs(sample)) - - assert abs(result_mean.item() - 0.1096) < 1e-3 - - def test_fp16_support(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter.cast("float16") - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - assert sample.dtype == paddle.float16 - - def test_unique_timesteps(self, **config): - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(scheduler.config.num_train_timesteps) - assert len(scheduler.timesteps.unique()) == scheduler.num_inference_steps diff --git a/ppdiffusers/tests/schedulers/test_scheduler_vq_diffusion.py b/ppdiffusers/tests/schedulers/test_scheduler_vq_diffusion.py deleted file mode 100644 index c40e7834d682..000000000000 --- a/ppdiffusers/tests/schedulers/test_scheduler_vq_diffusion.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle -import paddle.nn.functional as F - -from ppdiffusers import VQDiffusionScheduler - -from .test_schedulers import SchedulerCommonTest - - -class VQDiffusionSchedulerTest(SchedulerCommonTest): - scheduler_classes = (VQDiffusionScheduler,) - - def get_scheduler_config(self, **kwargs): - config = { - "num_vec_classes": 4097, - "num_train_timesteps": 100, - } - - config.update(**kwargs) - return config - - def dummy_sample(self, num_vec_classes): - batch_size = 4 - height = 8 - width = 8 - - sample = paddle.randint(0, num_vec_classes, (batch_size, height * width)) - - return sample - - @property - def dummy_sample_deter(self): - assert False - - def dummy_model(self, num_vec_classes): - def model(sample, t, *args): - batch_size, num_latent_pixels = sample.shape - logits = paddle.rand((batch_size, num_vec_classes - 1, num_latent_pixels)) - return_value = F.log_softmax(logits.cast("float64"), axis=1).cast("float32") - return return_value - - return model - - def test_timesteps(self): - for timesteps in [2, 5, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_num_vec_classes(self): - for num_vec_classes in [5, 100, 1000, 4000]: - self.check_over_configs(num_vec_classes=num_vec_classes) - - def test_time_indices(self): - for t in [0, 50, 99]: - self.check_over_forward(time_step=t) - - def test_add_noise_device(self): - pass diff --git a/ppdiffusers/tests/schedulers/test_schedulers.py b/ppdiffusers/tests/schedulers/test_schedulers.py deleted file mode 100755 index ac6868cf0a21..000000000000 --- a/ppdiffusers/tests/schedulers/test_schedulers.py +++ /dev/null @@ -1,642 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import inspect -import json -import os -import tempfile -import unittest -from typing import Dict, List, Tuple - -import numpy as np -import paddle - -import ppdiffusers -from ppdiffusers import ( - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - IPNDMScheduler, - LMSDiscreteScheduler, - VQDiffusionScheduler, - logging, -) -from ppdiffusers.configuration_utils import ConfigMixin, register_to_config -from ppdiffusers.schedulers.scheduling_utils import SchedulerMixin -from ppdiffusers.utils.testing_utils import CaptureLogger - - -class SchedulerObject(SchedulerMixin, ConfigMixin): - config_name = "config.json" - - @register_to_config - def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - e=[1, 3], - ): - pass - - -class SchedulerObject2(SchedulerMixin, ConfigMixin): - config_name = "config.json" - - @register_to_config - def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - f=[1, 3], - ): - pass - - -class SchedulerObject3(SchedulerMixin, ConfigMixin): - config_name = "config.json" - - @register_to_config - def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - e=[1, 3], - f=[1, 3], - ): - pass - - -class SchedulerBaseTests(unittest.TestCase): - def test_save_load_from_different_config(self): - obj = SchedulerObject() - - # mock add obj class to `ppdiffusers` - setattr(ppdiffusers, "SchedulerObject", SchedulerObject) - logger = logging.get_logger("ppdiffusers.configuration_utils") - - with tempfile.TemporaryDirectory() as tmpdirname: - obj.save_config(tmpdirname) - with CaptureLogger(logger) as cap_logger_1: - config = SchedulerObject2.load_config(tmpdirname) - new_obj_1 = SchedulerObject2.from_config(config) - - # now save a config parameter that is not expected - with open(os.path.join(tmpdirname, SchedulerObject.config_name), "r") as f: - data = json.load(f) - data["unexpected"] = True - - with open(os.path.join(tmpdirname, SchedulerObject.config_name), "w") as f: - json.dump(data, f) - - with CaptureLogger(logger) as cap_logger_2: - config = SchedulerObject.load_config(tmpdirname) - new_obj_2 = SchedulerObject.from_config(config) - - with CaptureLogger(logger) as cap_logger_3: - config = SchedulerObject2.load_config(tmpdirname) - new_obj_3 = SchedulerObject2.from_config(config) - - assert new_obj_1.__class__ == SchedulerObject2 - assert new_obj_2.__class__ == SchedulerObject - assert new_obj_3.__class__ == SchedulerObject2 - - assert cap_logger_1.out == "" - assert ( - cap_logger_2.out - == "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and" - " will" - " be ignored. 
Please verify your config.json configuration file.\n" - ) - assert cap_logger_2.out.replace("SchedulerObject", "SchedulerObject2") == cap_logger_3.out - - def test_save_load_compatible_schedulers(self): - SchedulerObject2._compatibles = ["SchedulerObject"] - SchedulerObject._compatibles = ["SchedulerObject2"] - - obj = SchedulerObject() - - # mock add obj class to `ppdiffusers` - setattr(ppdiffusers, "SchedulerObject", SchedulerObject) - setattr(ppdiffusers, "SchedulerObject2", SchedulerObject2) - logger = logging.get_logger("ppdiffusers.configuration_utils") - - with tempfile.TemporaryDirectory() as tmpdirname: - obj.save_config(tmpdirname) - - # now save a config parameter that is expected by another class, but not origin class - with open(os.path.join(tmpdirname, SchedulerObject.config_name), "r") as f: - data = json.load(f) - data["f"] = [0, 0] - data["unexpected"] = True - - with open(os.path.join(tmpdirname, SchedulerObject.config_name), "w") as f: - json.dump(data, f) - - with CaptureLogger(logger) as cap_logger: - config = SchedulerObject.load_config(tmpdirname) - new_obj = SchedulerObject.from_config(config) - - assert new_obj.__class__ == SchedulerObject - - assert ( - cap_logger.out - == "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and" - " will" - " be ignored. Please verify your config.json configuration file.\n" - ) - - def test_save_load_from_different_config_comp_schedulers(self): - SchedulerObject3._compatibles = ["SchedulerObject", "SchedulerObject2"] - SchedulerObject2._compatibles = ["SchedulerObject", "SchedulerObject3"] - SchedulerObject._compatibles = ["SchedulerObject2", "SchedulerObject3"] - - obj = SchedulerObject() - - # mock add obj class to `ppdiffusers` - setattr(ppdiffusers, "SchedulerObject", SchedulerObject) - setattr(ppdiffusers, "SchedulerObject2", SchedulerObject2) - setattr(ppdiffusers, "SchedulerObject3", SchedulerObject3) - logger = logging.get_logger("ppdiffusers.configuration_utils") - logger.setLevel(ppdiffusers.logging.INFO) - - with tempfile.TemporaryDirectory() as tmpdirname: - obj.save_config(tmpdirname) - - with CaptureLogger(logger) as cap_logger_1: - config = SchedulerObject.load_config(tmpdirname) - new_obj_1 = SchedulerObject.from_config(config) - - with CaptureLogger(logger) as cap_logger_2: - config = SchedulerObject2.load_config(tmpdirname) - new_obj_2 = SchedulerObject2.from_config(config) - - with CaptureLogger(logger) as cap_logger_3: - config = SchedulerObject3.load_config(tmpdirname) - new_obj_3 = SchedulerObject3.from_config(config) - - assert new_obj_1.__class__ == SchedulerObject - assert new_obj_2.__class__ == SchedulerObject2 - assert new_obj_3.__class__ == SchedulerObject3 - - assert cap_logger_1.out == "" - assert cap_logger_2.out == "{'f'} was not found in config. Values will be initialized to default values.\n" - assert cap_logger_3.out == "{'f'} was not found in config. 
Values will be initialized to default values.\n" - - -class SchedulerCommonTest(unittest.TestCase): - scheduler_classes = () - forward_default_kwargs = () - - @property - def dummy_sample(self): - batch_size = 4 - num_channels = 3 - height = 8 - width = 8 - - sample = paddle.rand((batch_size, num_channels, height, width)) - - return sample - - @property - def dummy_sample_deter(self): - batch_size = 4 - num_channels = 3 - height = 8 - width = 8 - - num_elems = batch_size * num_channels * height * width - sample = paddle.arange(num_elems) - sample = sample.reshape([num_channels, height, width, batch_size]) - sample = sample / num_elems - sample = sample.transpose([3, 0, 1, 2]) - - return sample - - def get_scheduler_config(self): - raise NotImplementedError - - def dummy_model(self): - def model(sample, t, *args): - return sample * t / (t + 1) - - return model - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - # TODO(Suraj) - delete the following two lines once DDPM, DDIM, and PNDM have timesteps casted to float by default - if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler): - time_step = float(time_step) - - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - if scheduler_class == VQDiffusionScheduler: - num_vec_classes = scheduler_config["num_vec_classes"] - sample = self.dummy_sample(num_vec_classes) - model = self.dummy_model(num_vec_classes) - residual = model(sample, time_step) - else: - sample = self.dummy_sample - residual = 0.1 * sample - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - new_scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # Make sure `scale_model_input` is invoked to prevent a warning - if scheduler_class != VQDiffusionScheduler: - _ = scheduler.scale_model_input(sample, 0) - _ = new_scheduler.scale_model_input(sample, 0) - - # Set the seed before step() as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = paddle.Generator().manual_seed(0) - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = paddle.Generator().manual_seed(0) - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - kwargs.update(forward_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler): - time_step = float(time_step) - - scheduler_config = self.get_scheduler_config() - scheduler = 
scheduler_class(**scheduler_config) - - if scheduler_class == VQDiffusionScheduler: - num_vec_classes = scheduler_config["num_vec_classes"] - sample = self.dummy_sample(num_vec_classes) - model = self.dummy_model(num_vec_classes) - residual = model(sample, time_step) - else: - sample = self.dummy_sample - residual = 0.1 * sample - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - new_scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = paddle.Generator().manual_seed(0) - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = paddle.Generator().manual_seed(0) - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - timestep = 1 - if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler): - timestep = float(timestep) - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - if scheduler_class == VQDiffusionScheduler: - num_vec_classes = scheduler_config["num_vec_classes"] - sample = self.dummy_sample(num_vec_classes) - model = self.dummy_model(num_vec_classes) - residual = model(sample, timestep) - else: - sample = self.dummy_sample - residual = 0.1 * sample - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - new_scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = paddle.Generator().manual_seed(0) - output = scheduler.step(residual, timestep, sample, **kwargs).prev_sample - - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = paddle.Generator().manual_seed(0) - new_output = new_scheduler.step(residual, timestep, sample, **kwargs).prev_sample - - assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_compatibles(self): - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - - scheduler = scheduler_class(**scheduler_config) - - assert all(c is not None for c in scheduler.compatibles) - - for comp_scheduler_cls in scheduler.compatibles: - comp_scheduler = comp_scheduler_cls.from_config(scheduler.config) - assert comp_scheduler is not None - - new_scheduler = 
scheduler_class.from_config(comp_scheduler.config) - - new_scheduler_config = {k: v for k, v in new_scheduler.config.items() if k in scheduler.config} - scheduler_diff = {k: v for k, v in new_scheduler.config.items() if k not in scheduler.config} - - # make sure that configs are essentially identical - assert new_scheduler_config == dict(scheduler.config) - - # make sure that only differences are for configs that are not in init - init_keys = inspect.signature(scheduler_class.__init__).parameters.keys() - assert set(scheduler_diff.keys()).intersection(set(init_keys)) == set() - - def test_from_pretrained(self): - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - - scheduler = scheduler_class(**scheduler_config) - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_pretrained(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - - assert scheduler.config == new_scheduler.config - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - timestep_0 = 0 - timestep_1 = 1 - - for scheduler_class in self.scheduler_classes: - if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler): - timestep_0 = float(timestep_0) - timestep_1 = float(timestep_1) - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - if scheduler_class == VQDiffusionScheduler: - num_vec_classes = scheduler_config["num_vec_classes"] - sample = self.dummy_sample(num_vec_classes) - model = self.dummy_model(num_vec_classes) - residual = model(sample, timestep_0) - else: - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output_0 = scheduler.step(residual, timestep_0, sample, **kwargs).prev_sample - output_1 = scheduler.step(residual, timestep_1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - def test_scheduler_outputs_equivalence(self): - def set_nan_tensor_to_zero(t): - zeros = paddle.zeros_like(t) - t = paddle.where(t == float("inf"), zeros, t) - return t - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - paddle.allclose( - set_nan_tensor_to_zero(tuple_object).cast("float32"), - set_nan_tensor_to_zero(dict_object).cast("float32"), - atol=1e-5, - ), - msg=( - "Tuple and dict output are not equal. Difference:" - f" {paddle.max(paddle.abs(tuple_object - dict_object))}. Tuple has `nan`:" - f" {paddle.isnan(tuple_object).any()} and `inf`: {paddle.isinf(tuple_object)}. Dict has" - f" `nan`: {paddle.isnan(dict_object).any()} and `inf`: {paddle.isinf(dict_object)}." 
- ), - ) - - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", 50) - - timestep = 0 - if len(self.scheduler_classes) > 0 and self.scheduler_classes[0] == IPNDMScheduler: - timestep = 1 - - for scheduler_class in self.scheduler_classes: - if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler): - timestep = float(timestep) - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - if scheduler_class == VQDiffusionScheduler: - num_vec_classes = scheduler_config["num_vec_classes"] - sample = self.dummy_sample(num_vec_classes) - model = self.dummy_model(num_vec_classes) - residual = model(sample, timestep) - else: - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # Set the seed before state as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = paddle.Generator().manual_seed(0) - outputs_dict = scheduler.step(residual, timestep, sample, **kwargs) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # Set the seed before state as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = paddle.Generator().manual_seed(0) - outputs_tuple = scheduler.step(residual, timestep, sample, return_dict=False, **kwargs) - - recursive_check(outputs_tuple, outputs_dict) - - def test_scheduler_public_api(self): - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - if scheduler_class != VQDiffusionScheduler: - self.assertTrue( - hasattr(scheduler, "init_noise_sigma"), - f"{scheduler_class} does not implement a required attribute `init_noise_sigma`", - ) - self.assertTrue( - hasattr(scheduler, "scale_model_input"), - ( - f"{scheduler_class} does not implement a required class method `scale_model_input(sample," - " timestep)`" - ), - ) - self.assertTrue( - hasattr(scheduler, "step"), - f"{scheduler_class} does not implement a required class method `step(...)`", - ) - - if scheduler_class != VQDiffusionScheduler: - sample = self.dummy_sample - scaled_sample = scheduler.scale_model_input(sample, 0.0) - self.assertEqual(sample.shape, scaled_sample.shape) - - def test_add_noise_device(self): - for scheduler_class in self.scheduler_classes: - if scheduler_class == IPNDMScheduler: - continue - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(100) - - sample = self.dummy_sample - scaled_sample = scheduler.scale_model_input(sample, 0.0) - self.assertEqual(sample.shape, scaled_sample.shape) - - noise = paddle.randn(scaled_sample.shape, dtype=scaled_sample.dtype) - t = scheduler.timesteps[5][None] - noised = scheduler.add_noise(scaled_sample, noise, t) - 
self.assertEqual(noised.shape, scaled_sample.shape)
-
-    def test_deprecated_kwargs(self):
-        for scheduler_class in self.scheduler_classes:
-            has_kwarg_in_model_class = "kwargs" in inspect.signature(scheduler_class.__init__).parameters
-            has_deprecated_kwarg = len(scheduler_class._deprecated_kwargs) > 0
-
-            if has_kwarg_in_model_class and not has_deprecated_kwarg:
-                raise ValueError(
-                    f"{scheduler_class} has `**kwargs` in its __init__ method but has not defined any deprecated"
-                    " kwargs under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if"
-                    " there are no deprecated arguments or add the deprecated argument with `_deprecated_kwargs ="
-                    " []`"
-                )
-
-            if not has_kwarg_in_model_class and has_deprecated_kwarg:
-                raise ValueError(
-                    f"{scheduler_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated"
-                    " kwargs under the `_deprecated_kwargs` class attribute. Make sure to either add the `**kwargs`"
-                    f" argument to {scheduler_class}.__init__ if there are deprecated arguments or remove the"
-                    " deprecated argument from `_deprecated_kwargs = []`"
-                )
-
-    def test_trained_betas(self):
-        for scheduler_class in self.scheduler_classes:
-            if scheduler_class == VQDiffusionScheduler:
-                continue
-
-            scheduler_config = self.get_scheduler_config()
-            scheduler = scheduler_class(**scheduler_config, trained_betas=np.array([0.1, 0.3]))
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                scheduler.save_pretrained(tmpdirname)
-                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
-
-            assert scheduler.betas.tolist() == new_scheduler.betas.tolist()
-
-    def test_getattr_is_correct(self):
-        for scheduler_class in self.scheduler_classes:
-            scheduler_config = self.get_scheduler_config()
-            scheduler = scheduler_class(**scheduler_config)
-
-            # save some things to test
-            scheduler.dummy_attribute = 5
-            scheduler.register_to_config(test_attribute=5)
-
-            logger = logging.get_logger("ppdiffusers.configuration_utils")
-            # 30 for warning
-            logger.setLevel(30)
-            with CaptureLogger(logger) as cap_logger:
-                assert hasattr(scheduler, "dummy_attribute")
-                assert getattr(scheduler, "dummy_attribute") == 5
-                assert scheduler.dummy_attribute == 5
-
-            # no warning should be thrown
-            assert cap_logger.out == ""
-
-            logger = logging.get_logger("ppdiffusers.schedulers.scheduling_utils")
-            # 30 for warning
-            logger.setLevel(30)
-            with CaptureLogger(logger) as cap_logger:
-                assert hasattr(scheduler, "save_pretrained")
-                fn = scheduler.save_pretrained
-                fn_1 = getattr(scheduler, "save_pretrained")
-
-                assert fn == fn_1
-            # no warning should be thrown
-            assert cap_logger.out == ""
-
-            # warning should be thrown
-            with self.assertWarns(FutureWarning):
-                assert scheduler.test_attribute == 5
-
-            with self.assertWarns(FutureWarning):
-                assert getattr(scheduler, "test_attribute") == 5
-
-            with self.assertRaises(AttributeError) as error:
-                scheduler.does_not_exist
-
-            assert str(error.exception) == f"'{type(scheduler).__name__}' object has no attribute 'does_not_exist'"